Global well-being in 2025: A multidimensional analysis of mental, financial, and social health in 92 countries
APPENDIX
This appendix includes all of the execution steps used in analyzing the data, from preprocessing with the raw public data (A0) to statistical analysis (A1 to A23).
In order to reproduce these steps, it is necessary to place all of the following files in the same directory as 000_analysis_script.qmd. Moreover, a revn.lock file and a Docker image were provided to ensure that the analysis can be executed within the same software environment used by the authors.
- 999_public_data.rds
- 999_irish_sponsored_public.csv
- 111_ethnicity_labels_translated.csv
- 111_ethnicity_open_answers_recoded.csv
- 111_education_recoded.csv
- 111_sex_open_answers_recoded.csv
- 111_country_variables.csv
- 111_ip_repeated.csv
- 111_administrative_location.csv
- 111_response_ids_botnets.csv
- 111_income_recoded.rds
- 111_generic_version_country.csv
- 222_codebook.xlsx
- 777_countries_documentation (folder)
- revn.lock (optional)
- qmd-env-arm64.tar (optional)
To conduct the analyses (A1 to A23) without running the preprocessing steps, place 999_clean_data.rds in the same directory, run the Setup section, and start at any Analyses section. Each analysis section is independent and can be run separately.
Setup
Set working directory
Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(char = c(
"MetBrewer",
"ggridges",
"metafor",
"ggtext",
"binom",
"mgcv", # Generalized Additive Models
"interactions", # Interaction plots
"lsr", # effect size calculations
"survey", # weighted analysis
"corrplot", # Correlation plots
"ggh4x", # Advanced ggplot2 facets
"htmltools", # Create HTML content
"sf", # Handle spatial data
"rnaturalearth", # Obtain map data
"rnaturalearthdata", # Obtain map data
"gridExtra", # Arrange multiple plots
"grid", # Arrange multiple plots
"gtable", # Arrange multiple plots
"ggplotify", # Convert plots to grobs
"qualtRics", # Read files obtained through Qualtrics
"readr", # Write csv files
"readxl", # Read excel files
"flextable", # Create Word documents
"officer", # Create Word documents
"dplyr", # Manipulate data during preprocessing
"tidyr", # Manipulate data during preprocessing
"stringr", # Manipulate strings during preprocessing
"janitor", # Clean and manage data during preprocessing
"ggplot2", # Create plots
"ggfx", # Add drop shadow effect on elements in a plot
"psych", # Conduct reliability tests
"car", # Conduct Anova tests on models
"emmeans", # Perform contrast analysis
"lme4", # Run Linear Mixed Models
"kableExtra", # Display tables in HTML format
"sjPlot", # Generate advanced tables for models
"report", # Generate advanced reporting for linear models
"performance", # Generate advanced reporting for linear models
"Hmisc", # Generate advanced reporting and weighted statistics
"semTools", # For McDonald's omega
"lavaan", # Needed for semTools
"rmcorr", # Multilevel correlations
"ggeffects", # Generate plots from marginal effects
"tibble", # Data wrangling
"purrr", # Data wrangling
"forcats", # Data wrangling
"see", # Data visualization
"broom.mixed", # Data wrangling
"showtext", # Custom fonts in plots
"ggflags", # Country flags in plots
"scales", # Scale functions for ggplot2
"countrycode", # Convert country names to different coding schemes
"cowplot", # Combine multiple ggplots into one figure
"sessioninfo", # Report session info
"rlang",
"visdat", # Visualise missing data
"labelled", # Handle variable labels
"sysfonts", # Custom fonts in plots
"reactable", # Create interactive tables
"weights", # Weighted statistics
"leaflet", # Interactive maps
"leaflet.extras", # Interactive maps
"leaflet.extras2",
"lubridate", # Work with dates
"stringr", # Manipulate strings during preprocessing
"cowplot"
))Define global settings
options(
# Remove scientific notation
scipen = 999,
width = 150,
# Clean up dplyr messages
dplyr.summarise.inform = FALSE)
# Set up theme for plots
sysfonts::font_add_google("Inter")
showtext::showtext_auto()
theme_gmh <-
ggplot2::theme_minimal(base_family = "Inter", base_size = 12) +
ggplot2::theme(
text = element_text(family = "Inter", colour = "#051520"),
axis.text.y = element_text(color = "#051520"),
axis.text.x = element_text(
color = "#051520",
margin = margin(t = 1),
face = "bold"
),
axis.title.x = element_text(color = "#051520", face = "bold"),
axis.title.y = element_text(color = "#051520", face = "bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line.x = element_line(colour = "#051520", linewidth = 0.4),
plot.margin = margin(6, 6, 6, 6),
plot.subtitle = ggplot2::element_text(color = "#051520"),
plot.background = ggplot2::element_rect(fill = "transparent", color = NA),
panel.background = ggplot2::element_rect(fill = "transparent", color = NA),
legend.background = ggplot2::element_rect(fill = "transparent", color = NA)
)
ggplot2::theme_set(theme_gmh)
# Print variables in a tidy way
table_label <- function(col) {
# extract what is after $ in dataframe$column
name <- sub(".*\\$(.+)", "\\1", deparse(substitute(col)))
# extract the label of the given column
lab <- attr(col, "label")
# print header wih column name and label
cat(sprintf("$%s\n%s\n", name, lab))
# print table output with NA counts
tbl <- table(col, useNA = "always")
names(dimnames(tbl)) <- NULL
print(tbl)
# print the class of the column
cat("Class:", paste(class(col), collapse = ", "), "\n")
}
# Print a pretty table
print_reactable <- function(data, sorted_col, width) {
reactable::reactable(
data,
pagination = FALSE,
height = 650,
width = width,
defaultSorted = sorted_col,
defaultSortOrder = "asc",
searchable = TRUE,
striped = TRUE,
compact = TRUE,
highlight = TRUE,
defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
defaultColDef = reactable::colDef(
vAlign = "center",
headerVAlign = "bottom",
class = "cell",
headerClass = "header"
)
)
}
# Print pretty summaries
print_summ <- function(model, design, var, term) {
format_p <- function(p) {
if (p < 0.001) {
return("< .001")
}
base::format(base::round(p, 3), nsmall = 3)
}
term_test <- survey::regTermTest(model, term)
svy_resid <-
update(design, .resid = stats::residuals(model, type = "response"))
var_y <-
survey::svyvar(stats::as.formula(paste0("~", var)), design = svy_resid)[1]
var_e <- survey::svyvar(~.resid, design = svy_resid)[1]
r2 <- 1 - (var_e / var_y)
cohens_f <- base::sqrt(r2 / (1 - r2))
percent_var_explained <- r2 * 100
tibble::tibble(
Ward_F =
base::format(base::round(base::as.numeric(term_test$Ftest[1]), 2), nsmall = 2),
df1 = term_test$df,
df2 = term_test$ddf,
p = format_p(term_test$p),
r2 = base::format(base::round(r2, 4), nsmall = 4),
cohens_f = base::format(base::round(cohens_f, 4), nsmall = 4),
percent_var_explained =
base::format(base::round(percent_var_explained, 4), nsmall = 4)
)
}
# Calculate weighted correlation
weighted_corr <- function(dat, var_x, var_y, multiple = FALSE) {
if (!isTRUE(multiple)) {
var_x <- rlang::ensym(var_x)
var_y <- rlang::ensym(var_y)
design <- survey::svydesign(
ids = ~ 1,
weights = ~ ps_weight,
data = dat
)
est <- jtools::svycor(
stats::as.formula(
paste0("~", rlang::as_name(var_x), " + ", rlang::as_name(var_y))),
design,
sig.stats = TRUE,
bootn = 1000,
mean1 = TRUE
)
data.frame(
r = as.character(format(round(est$cors[2], 3), nsmall = 3)),
t = as.character(format(round(est$t.values[2], 2), nsmall = 2)),
p = dplyr::if_else(
est$p.values[2] < 0.01, "<.001",
as.character(format(round(est$p.values[2], 3), nsmall = 3)))
)
} else {
outcome_sym <- rlang::ensym(var_x)
items_val <- rlang::eval_tidy(rlang::enquo(var_y))
design <- survey::svydesign(ids = ~1, weights = ~ps_weight, data = dat)
results <- purrr::map_dfr(items_val, function(item_name) {
f <- stats::as.formula(paste0("~", rlang::as_name(outcome_sym), " + ", item_name))
est <- jtools::svycor(f, design, sig.stats = TRUE, bootn = 1000, mean1 = TRUE)
r_val <- est$cors[2]
t_val <- est$t.values[2]
p_val <- est$p.values[2]
data.frame(
item = item_name,
r = as.character(format(round(est$cors[2], 3), nsmall = 3)),
t = as.character(format(round(est$t.values[2], 2), nsmall = 2)),
p = dplyr::if_else(
est$p.values[2] < 0.01, "<.001",
as.character(format(round(est$p.values[2], 3), nsmall = 3)))
)
})
return(results)
}
}
# Define MPWB items and labels
mpwb_items <- c(
"mpwb_positive_relationships",
"mpwb_meaning",
"mpwb_competence",
"mpwb_engagement",
"mpwb_self_esteem",
"mpwb_optimism",
"mpwb_positive_emotion",
"mpwb_emotional_stability",
"mpwb_resilience",
"mpwb_vitality"
)
mpwb_labels <- c(
mpwb_positive_relationships = "Positive relationships",
mpwb_meaning = "Meaning",
mpwb_competence = "Competence",
mpwb_engagement = "Engagement",
mpwb_self_esteem = "Self-esteem",
mpwb_optimism = "Optimism",
mpwb_positive_emotion = "Positive emotion",
mpwb_emotional_stability = "Emotional stability",
mpwb_resilience = "Resilience",
mpwb_vitality = "Vitality"
)
phq4_items <- c("phq_interest", "phq_down", "gad_anxious", "gad_worry")
# Define EU countries
eu_countries <- c(
"Austria",
"Belgium",
"Bulgaria",
"Croatia",
"Cyprus",
"Czech Republic",
"Denmark",
"Estonia",
"Finland",
"France",
"Germany",
"Greece",
"Hungary",
"Ireland",
"Italy",
"Latvia",
"Netherlands",
"Poland",
"Portugal",
"Romania",
"Slovakia",
"Slovenia",
"Spain",
"Sweden"
)
# List of countries whose weight scores were replaced by 1.
flagged_countries <-
c("Moldova", "Romania", "Nigeria", "Montenegro", "Angola",
"Morocco", "Uruguay", "Paraguay", "Greece", "Iran",
"Hungary", "Kosovo", "Yemen", "Chile", "Uganda")Load data
The data collection began on June 2, 2025, with a soft-launch phase. The survey’s time zone was set to New York City. Due to time zone differences, some responses show a date of June 1, 2025, even though it was already June 2 in the collaborators’ local time. Some collaborators were residing in countries different from their target country.
# Raw public dataset
df_pub_raw <- base::readRDS("999_public_data.rds")
# View number of rows in the raw dataset
nrow(df_pub_raw)[1] 68311
# Cleaned dataset
df_gmh <- base::readRDS("999_cleaned_data.rds")
# Create general design
svy <- survey::svydesign(ids = ~ 1, weights = ~ ps_weight, data = df_gmh)
# View number of rows in the cleaned dataset
nrow(df_gmh)[1] 53799
# Codebook
codebook <- readxl::read_excel(
path = "222_codebook.xlsx",
sheet = "df_cleaned",
skip = 1,
col_names = TRUE
)
# TODO
# Verify that items regarding individual location are being removed in the black box
# Think if duration adjusted should be 21 + n_items_after or 20 + n_items_after
# Add skimr::skim(df_pub) once the data is cleaned
# Add codebook at the end
# Are income_orig_cat_10 and 11 necessary? If we keep then add ordered factor
# Run a major double checking of financial cleaning variables vs- orig
# add the list of countries were we asked "use digits 0-9" to income section.
# Ask how zimbabwe transformation values were obtained
# make sure the tabs "all together" match the responsesids in the other tabs
# clean history from google sheets.
# make sure that assessment fin does not contain excluded participants.
# use kable to print dataframe added and allow search and so on
# check if any packages is not being used in the load package section
# update max section in the intro
# save all tables and all fig
# run assumption checks for all models
# rename all weighted_n calculated as sums to sum_weight. and calculate effective kirs as weighted_n.
# To correct the anticipated but substantial imbalances in non-representative samples, within-country propensity weighting was applied using national population benchmarks. Expected population proportions for age, gender, and educational attainment (25+) using sex and age estimates from the United States Census Bureau’s International Database 65, educational attainment estimates sourced using the UNESCO data browser 66, some countries required specific equivalent sources (see Appendix A21 for full details) were used to reweight responses using the WeightIt (WeightIt, v1.5.0) framework 67 via gradient boosted trees with stabilized weights for robustness (see Appendix A21). A0. Data Preprocessing
A0.1. Cleaning the dataset
Rename columns
df_pub <- df_pub_raw |>
dplyr::rename(
duration_sec = `Duration (in seconds)`,
mpwb_competence = Q5,
mpwb_emotional_stability = Q7,
mpwb_engagement = Q9,
mpwb_meaning = Q11,
mpwb_optimism = Q13,
mpwb_positive_emotion = Q15,
mpwb_positive_relationships = Q17,
mpwb_resilience = Q19,
mpwb_self_esteem = Q21,
mpwb_vitality = Q23,
life_satisfaction = Q29,
income_orig = Q31,
income_text_orig = Q31_10_TEXT,
household_size = Q32,
birth_year_orig = Q25,
sex_orig = Q26,
education_orig = Q27,
employment_orig = Q28,
ethnicity_citizenship_orig = Q30,
assets_orig = Q34,
debts_orig = Q33,
bot_check = Q43,
followup = Q35,
phq_interest = Q36_1,
phq_down = Q36_2,
gad_anxious = Q36_3,
gad_worry = Q36_4,
childhood_SES = Q37,
fin_outlook = Q38,
fin_outlook_conf = Q39,
attention_care = Q40,
work_arrangement = Q41
) |>
dplyr::relocate(Q_Language, .after = UserLanguage) |>
# Overview of the data
dplyr::glimpse(width = 100)Rows: 68,311
Columns: 45
$ StartDate <dttm> 2025-06-01 07:14:43, 2025-06-01 07:33:44, 2025-06-01 19:24:40…
$ EndDate <dttm> 2025-06-01 07:31:52, 2025-06-01 07:41:06, 2025-06-01 19:30:50…
$ Status <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Progress <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 10…
$ duration_sec <dbl> 1028, 442, 370, 426, 512, 344, 341, 744, 582, 1006, 233, 173, …
$ Finished <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ RecordedDate <dttm> 2025-06-01 07:31:52, 2025-06-01 07:41:07, 2025-06-01 19:30:52…
$ ResponseId <chr> "R_2i29tTIFUyYilqv", "R_2nemeLi6AnL1uNP", "R_3LqMY0lbugweTSh",…
$ UserLanguage <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "PT-BRA", "P…
$ Q_Language <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "PT-BRA", "P…
$ mpwb_competence <dbl> 6, 5, 5, 5, 5, 5, 5, 7, 5, 5, 4, 7, 5, 5, 5, 4, 6, 6, 5, 5, 7,…
$ mpwb_emotional_stability <dbl> 6, 3, 5, 5, 5, 4, 5, 7, 6, 7, 5, 5, 4, 7, 5, 4, 7, 6, 4, 5, 5,…
$ mpwb_engagement <dbl> 6, 6, 5, 6, 5, 4, 5, 7, 3, 5, 5, 6, 7, 7, 5, 4, 4, 5, 5, 3, 7,…
$ mpwb_meaning <dbl> 6, 3, 5, 6, 4, 4, 6, 7, 5, 6, 4, 5, 4, 6, 5, 4, 7, 5, 4, 5, 7,…
$ mpwb_optimism <dbl> 7, 5, 5, 6, 5, 3, 7, 7, 6, 7, 4, 6, 5, 6, 5, 4, 7, 5, 6, 6, 7,…
$ mpwb_positive_emotion <dbl> 5, 3, 5, 6, 5, 7, 6, 7, 7, 5, 4, 5, 6, 6, 4, 4, 7, 5, 5, 5, 7,…
$ mpwb_positive_relationships <dbl> 5, 5, 5, 4, 7, 7, 6, 7, 5, 7, 5, 6, 5, 6, 5, 4, 6, 5, 4, 3, 7,…
$ mpwb_resilience <dbl> 5, 5, 5, 6, 5, 3, 6, 7, 5, 7, 4, 6, 4, 6, 4, 4, 7, 7, 4, 3, 6,…
$ mpwb_self_esteem <dbl> 6, 5, 5, 7, 3, 4, 7, 7, 6, 5, 4, 7, 5, 6, 5, 4, 6, 6, 6, 6, 7,…
$ mpwb_vitality <dbl> 5, 1, 4, 5, 3, 4, 5, 7, 6, 7, 4, 6, 4, 6, 5, 4, 5, 5, 5, 5, 5,…
$ life_satisfaction <dbl> 7, 6, 9, 8, 5, 8, 7, 10, 8, 10, 0, 8, 6, 9, 6, 5, 10, 8, 7, 8,…
$ income_orig <dbl> 7, 9, 6, 9, 7, 8, 5, 4, 5, 7, 1, 5, 4, 6, 7, 6, 10, 10, 6, 3, …
$ income_text_orig <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ household_size <dbl> 3, 5, 1, 4, 5, 4, 2, 4, 2, 1, 1, 6, 5, 6, 3, 4, 10, 7, 1, 12, …
$ birth_year_orig <chr> "1989", "1984", "1971", "1986", "1993", "2005", "1986", "1975"…
$ sex_orig <dbl> 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2,…
$ education_orig <dbl> 6, 6, 5, 8, 6, 3, 7, 5, 7, 6, 5, 8, 5, 7, 7, 6, 5, 6, 5, 4, 7,…
$ employment_orig <chr> "3", "3", "3", "3", "8", "2,8", "3", "3", "1", "6", "8", "3", …
$ ethnicity_citizenship_orig <chr> "3,6,10", "1,10", "5,10", "5,10", "3,10", "5,10", "1,10", "1,1…
$ assets_orig <chr> "5", "2", "20.000", "1000000", "5000", "0,00", "250000", "1,00…
$ debts_orig <chr> "10000000", "2", "18000", "0", "125000", "0,00", "0", "1,000.0…
$ bot_check <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ followup <dbl> 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1,…
$ phq_interest <dbl> NA, 2, NA, 2, 3, 6, 1, NA, 2, 1, NA, 2, NA, 1, 2, 2, NA, 1, 2,…
$ phq_down <dbl> NA, 3, NA, 1, 3, 4, 1, NA, 1, 1, NA, 2, NA, 2, 2, 2, NA, 1, 2,…
$ gad_anxious <dbl> NA, 2, NA, 2, 3, 7, 2, NA, 2, 1, NA, 3, NA, 2, 3, 2, NA, 1, 1,…
$ gad_worry <dbl> NA, 1, NA, 2, 3, 7, 1, NA, 1, 1, NA, 3, NA, 2, 3, 2, NA, 1, 2,…
$ childhood_SES <dbl> NA, 4, NA, 4, 2, 4, 1, NA, 1, 4, NA, 2, NA, 4, 4, 3, NA, 3, 4,…
$ fin_outlook <dbl> NA, 3, NA, 4, 4, 5, 5, NA, 5, 4, NA, 5, NA, 4, 5, 5, NA, 4, 5,…
$ fin_outlook_conf <dbl> NA, 10, NA, 8, 8, 10, 10, NA, 10, 8, NA, 8, NA, 9, 7, 8, NA, 1…
$ attention_care <dbl> NA, 5, NA, 5, 7, 4, 5, NA, 5, 6, NA, 5, NA, 5, 6, 5, NA, 5, 4,…
$ work_arrangement <dbl> NA, 4, NA, 3, NA, 1, 2, NA, 5, NA, NA, 1, NA, 1, 3, 4, NA, 1, …
$ br <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ bs <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ CoreMPWB_DO <chr> "Q4|Q23|Q6|Q21|Q8|Q15|Q10|Q19|Q12|Q9|Q14|Q5|Q16|Q7|Q18|Q17|Q20…
Identification of the Countries
$UserLanguage
User Language
AM-ARM AM-ETH AR-ARE AR-BHR AR-DZA AR-EGY AR-KWT AR-LBN AR-MAR AR-OMN AR-QAT AR-SAU AR-TCD AR-YEM BG-BGR BN-BGD BS-BIH CNR-MNE
334 303 66 100 203 322 106 416 302 520 503 296 7 577 393 536 642 358
CS-CZE DA-DNK DE-AUT DE-CHE DE-DEU EL-CYP EL-GRC EN EN-ARE EN-AUS EN-BHR EN-CAN EN-EGY EN-EST EN-ETH EN-GBR EN-GEN EN-GEO
267 338 685 452 1008 218 532 5549 270 605 111 535 547 9 249 852 69 54
EN-HKG EN-HUN EN-IDN EN-IND EN-IRL EN-KOR EN-KWT EN-MNG EN-MYS EN-NGA EN-NLD EN-NOR EN-PAK EN-PHL EN-QAT EN-SGP EN-UGA EN-YEM
17 6 12 921 461 11 209 40 203 721 161 26 347 2280 23 298 332 3
EN-ZAF EN-ZMB EN-ZWE ES-ARG ES-BOL ES-CHL ES-ECU ES-ESP ES-MEX ES-PER ES-PRY ES-URY ES-US ET-EST FA-IRN FI-FIN FIL-PHL FR-BEL
279 34 169 769 341 240 1075 729 1164 1031 205 815 159 2393 292 275 1276 70
FR-CAN FR-CHE FR-FRA FR-MDG FR-SEN FR-TCD HE-ISR HI-IND HR-HRV HU-HUN ID-IDN IT-CHE IT-ITA JA-JPN KA-GEO KK-KAZ KO-KOR KY-KGZ
339 292 1175 169 211 185 437 706 455 729 1489 79 566 549 450 131 481 166
LV-LVA MK-MKD MN-MNG MS-MYS NL-BEL NL-NLD NO-NOR PL-POL PT-AGO PT-BRA PT-MOZ PT-PRT PT-TLS RO-MDA RO-ROU RU-KAZ RU-KGZ RU-RUS
1023 268 327 613 261 287 483 1288 329 2094 154 579 277 511 861 656 209 1322
RU-UZB SK-SVK SL-SVN SN-ZWE SQI-ALB SQI-XKX SR-SRB SR-XKX SV-SWE TH-THA TR-TUR UK-UKR UR-PAK UZ-UZB ZH-CHN ZH-HKG ZH-TWN <NA>
119 724 746 106 2284 1371 420 2 1149 440 682 749 160 543 2523 220 201 0
Class: character
# Create column with country names mapped from UserLanguage
country_map <- c(
"SQI-ALB" = "Albania",
"AR-DZA" = "Algeria",
"PT-AGO" = "Angola",
"ES-ARG" = "Argentina",
"AM-ARM" = "Armenia",
"EN-AUS" = "Australia",
"DE-AUT" = "Austria",
"AR-BHR" = "Bahrain",
"EN-BHR" = "Bahrain",
"BN-BGD" = "Bangladesh",
"FR-BEL" = "Belgium",
"NL-BEL" = "Belgium",
"ES-BOL" = "Bolivia",
"BS-BIH" = "Bosnia-Herzegovina",
"PT-BRA" = "Brazil",
"BG-BGR" = "Bulgaria",
"EN-CAN" = "Canada",
"FR-CAN" = "Canada",
"AR-TCD" = "Chad",
"FR-TCD" = "Chad",
"ES-CHL" = "Chile",
"ZH-CHN" = "China",
"HR-HRV" = "Croatia",
"EL-CYP" = "Cyprus",
"CS-CZE" = "Czech Republic",
"DA-DNK" = "Denmark",
"ES-ECU" = "Ecuador",
"AR-EGY" = "Egypt",
"EN-EGY" = "Egypt",
"EN-EST" = "Estonia",
"ET-EST" = "Estonia",
"AM-ETH" = "Ethiopia",
"EN-ETH" = "Ethiopia",
"FR-FRA" = "France",
"FI-FIN" = "Finland",
"EN-GEO" = "Georgia",
"KA-GEO" = "Georgia",
"DE-DEU" = "Germany",
"EL-GRC" = "Greece",
"EN-HKG" = "Hong Kong",
"ZH-HKG" = "Hong Kong",
"EN-HUN" = "Hungary",
"HU-HUN" = "Hungary",
"EN-IND" = "India",
"HI-IND" = "India",
"ID-IDN" = "Indonesia",
"EN-IDN" = "Indonesia",
"FA-IRN" = "Iran",
"EN-IRL" = "Ireland",
"HE-ISR" = "Israel",
"IT-ITA" = "Italy",
"JA-JPN" = "Japan",
"KK-KAZ" = "Kazakhstan",
"RU-KAZ" = "Kazakhstan",
"EN-KOR" = "Republic of Korea",
"KO-KOR" = "Republic of Korea",
"SQI-XKX" = "Kosovo",
"SR-XKX" = "Kosovo",
"AR-KWT" = "Kuwait",
"EN-KWT" = "Kuwait",
"KY-KGZ" = "Kyrgyzstan",
"RU-KGZ" = "Kyrgyzstan",
"LV-LVA" = "Latvia",
"AR-LBN" = "Lebanon",
"MK-MKD" = "North Macedonia",
"FR-MDG" = "Madagascar",
"MS-MYS" = "Malaysia",
"EN-MYS" = "Malaysia",
"ES-MEX" = "Mexico",
"RO-MDA" = "Moldova",
"EN-MNG" = "Mongolia",
"MN-MNG" = "Mongolia",
"CNR-MNE" = "Montenegro",
"AR-MAR" = "Morocco",
"PT-MOZ" = "Mozambique",
"NL-NLD" = "Netherlands",
"EN-NLD" = "Netherlands",
"EN-NGA" = "Nigeria",
"EN-NOR" = "Norway",
"NO-NOR" = "Norway",
"AR-OMN" = "Oman",
"UR-PAK" = "Pakistan",
"EN-PAK" = "Pakistan",
"ES-PRY" = "Paraguay",
"ES-PER" = "Peru",
"EN-PHL" = "Philippines",
"FIL-PHL" = "Philippines",
"PL-POL" = "Poland",
"PT-PRT" = "Portugal",
"AR-QAT" = "Qatar",
"EN-QAT" = "Qatar",
"RO-ROU" = "Romania",
"RU-RUS" = "Russia",
"AR-SAU" = "Saudi Arabia",
"FR-SEN" = "Senegal",
"SR-SRB" = "Serbia",
"EN-SGP" = "Singapore",
"SK-SVK" = "Slovakia",
"SL-SVN" = "Slovenia",
"EN-ZAF" = "South Africa",
"ES-ESP" = "Spain",
"SV-SWE" = "Sweden",
"FR-CHE" = "Switzerland",
"DE-CHE" = "Switzerland",
"IT-CHE" = "Switzerland",
"ZH-TWN" = "Taiwan",
"TH-THA" = "Thailand",
"PT-TLS" = "Timor-Leste",
"TR-TUR" = "Türkiye",
"EN-UGA" = "Uganda",
"UK-UKR" = "Ukraine",
"AR-ARE" = "UAE",
"EN-ARE" = "UAE",
"EN-GBR" = "UK",
"EN" = "USA",
"ES-US" = "USA",
"ES-URY" = "Uruguay",
"RU-UZB" = "Uzbekistan",
"UZ-UZB" = "Uzbekistan",
"AR-YEM" = "Yemen",
"EN-YEM" = "Yemen",
"EN-ZMB" = "Zambia",
"EN-ZWE" = "Zimbabwe",
"SN-ZWE" = "Zimbabwe",
"EN-GEN" = "Global"
)
df_pub <- df_pub |>
dplyr::mutate(
# Identify country based on UserLanguage
country = country_map[UserLanguage],
# Transform UserLanguage to ISO codes
# (the last three characters identify the ISO3 code except USA)
iso3 = stringr::str_extract(UserLanguage, "[A-Z]{3}$"),
# Clean the code for the USA
iso3 = dplyr::case_when(
UserLanguage == "EN" ~ "USA",
UserLanguage == "ES-US" ~ "USA",
UserLanguage == "EN-GEN" ~ NA_character_,
TRUE ~ iso3
),
# Convert ISO3 to ISO2
iso2 = countrycode::countrycode(
iso3,
origin = "iso3c",
destination = "iso2c",
custom_match = c("XKX" = "XK"))
) |>
dplyr::relocate(country, iso2, iso3, .after = UserLanguage)
# Sanity check: Cross-tab of countries by language
df_pub |>
dplyr::count(country, iso2, iso3, sort = TRUE) |>
dplyr::filter(!is.na(country)) |>
print_reactable(sorted_col = "country", width = 500)Global Version Processing
A global version of the survey was created to ensure people from countries that weren’t specifically targeted in this study or whose native languages weren’t provided could still take part. This version didn’t have any changes made for specific countries. There was only an open-text field for the income item, and all financial items asked for values in USD.
# Identify country and citizenship
gen_ident <-
readr::read_csv("111_generic_version_country.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 69
Columns: 2
$ ResponseId <chr> "R_4CJBLtS3qvvRTf7", "R_2EGKdy6ce2zvQls", "R_8f1msPTljX0SGpw", "R_7f1bVmdQG7qh…
$ country_gen <chr> "Australia", "Austria", "Austria", "Bangladesh", "Bangladesh", "Bangladesh", "…
[1] 68311
df_pub <- df_pub |>
dplyr::left_join(gen_ident, by = "ResponseId") |>
dplyr::relocate(country_gen, .after = country)
nrow(df_pub)[1] 68311
# Sanity check: View the country counts of global version participants
# It was not possible to identify the country for one participant
df_pub |> dplyr::filter(UserLanguage == "EN-GEN") |>
dplyr::group_by(country_gen) |>
dplyr::summarise(n = dplyr::n()) |>
base::print(n = Inf)# A tibble: 34 × 2
country_gen n
<chr> <int>
1 Afghanistan 1
2 Australia 1
3 Austria 2
4 Bangladesh 4
5 Belgium 7
6 Bhutan 2
7 Colombia 3
8 Democratic Republic of the Congo 1
9 Dominican Republic 1
10 Finland 1
11 France 3
12 Germany 2
13 Guatemala 1
14 Honduras 1
15 India 2
16 Italy 1
17 Kenya 1
18 Korea 1
19 Lebanon 7
20 Namibia 1
21 Nepal 1
22 New Zealand 2
23 Norway 1
24 Oman 5
25 Pakistan 1
26 Philippines 2
27 Sri Lanka 1
28 Sweden 1
29 Thailand 1
30 UAE 4
31 UK 2
32 Zambia 3
33 Zimbabwe 1
34 <NA> 1
Exclusion of Countries with Small Sample Sizes
We excluded the Global version and Zambia because the sample sizes were not sufficiently large. The Global version does not have the country-specific changes that were made in the target countries, consequently those answers can’t be compared. Zambia is not included because it only has 34 participants, which is less than the 120 required.
# View countries with less than 120 participants
df_pub |>
dplyr::group_by(country) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::filter(n < 120) |>
dplyr::arrange(n)# A tibble: 2 × 2
country n
<chr> <int>
1 Zambia 34
2 Global 69
[1] 68311
df_pub <- df_pub |>
dplyr::filter(UserLanguage != "EN-GEN", UserLanguage != "EN-ZMB") |>
dplyr::select(-country_gen)
nrow(df_pub)[1] 68208
Location Validation
location <-
readr::read_csv("111_administrative_location.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 68,208
Columns: 7
$ ResponseId <chr> "R_4rOh5csuvsUlmsF", "R_9Hk3KD5bE28n9bn", "R_8plovBuEUJfYQRO", "R_5miAsDI8Pi7…
$ loc_country <chr> "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", …
$ loc_resident <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ loc_admin_1 <chr> "Yerevan", "Yerevan", "Yerevan", "Syunik", "Yerevan", "Yerevan", "Yerevan", "…
$ loc_admin_2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ lat <dbl> 40.18720, 40.18720, 40.18720, 39.50899, 40.18720, 40.18720, 40.18720, 40.1872…
$ long <dbl> 44.51521, 44.51521, 44.51521, 46.34389, 44.51521, 44.51521, 44.51521, 44.5152…
[1] 68208
df_pub <- df_pub |>
dplyr::left_join(location, by = "ResponseId") |>
dplyr::relocate(
loc_resident,
loc_country,
loc_admin_1,
loc_admin_2,
lat,
long,
.after = Q_Language
)
# Sanity check: Number of rows should remain the same
nrow(df_pub)[1] 68208
# Sanity check: How many missing location validations are in the dataset?
nrow(df_pub |> dplyr::filter(is.na(loc_resident)))[1] 0
# Sanity check: How many missing latitudes are in the dataset?
nrow(df_pub |> dplyr::filter(is.na(lat) & !is.na(loc_country)))[1] 0
# Sanity check: View the counts of location validation
df_pub |> dplyr::filter(loc_resident == 0) |>
dplyr::group_by(country, loc_resident) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "country", width = 500)# Sanity check: View the counts of administrative level units per country
df_pub |> dplyr::filter(loc_resident == 1) |>
tidyr::pivot_longer(
cols = c(loc_admin_1, loc_admin_2),
names_to = "admin_level",
values_to = "value"
) |>
dplyr::summarise(
unique_n = dplyr::n_distinct(value, na.rm = TRUE),
.by = c(country, admin_level)
) |>
print_reactable(sorted_col = "country", width = 500)Merge Sponsored Participants from Ireland’s Team
All participants from the Ireland’s sponsored dataset completed the survey and only the mandatory items were included. Some variables had different options than those in the main dataset.
# Merge the sponsored Irish participants
df_irl_raw <-
readr::read_csv("999_irish_sponsored_public.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 1,200
Columns: 21
$ utcdateandtime <chr> "04/07/2025 09:30", "04/07/2025 09:35", "04/07/2025 09:36", …
$ participantprivateid <dbl> 13767545, 13767552, 13767549, 13767554, 13767544, 13767547, …
$ branchpbkg <chr> "male", "female", "female", "female", "female", "female", "m…
$ qid12object4response <chr> "Agree", "Strongly Agree", "Strongly Disagree", "Agree", "St…
$ qid13object6response <chr> "Strongly Agree", "Agree", "Agree", "Disagree", "Agree", "Ag…
$ qid14object8response <chr> "Strongly Agree", "Strongly Agree", "Strongly Disagree", "Ag…
$ qid15object9response <chr> "Strongly Agree", "Absolutely Agree", "Agree", "Agree", "Agr…
$ qid16object10response <chr> "Agree", "Strongly Agree", "Agree", "Disagree", "Agree", "Di…
$ qid17object11response <chr> "Strongly Agree", "Absolutely Agree", "Agree", "Agree", "Agr…
$ qid18object12response <chr> "Agree", "Absolutely Agree", "Strongly Agree", "Strongly Agr…
$ qid19object13response <chr> "Agree", "Agree", "Agree", "Strongly Agree", "Strongly Agree…
$ qid20object14response <chr> "Agree", "Strongly Agree", "Agree", "Agree", "Strongly Agree…
$ qid20object15response <chr> "Agree", "Agree", "Neutral", "Disagree", "Agree", "Disagree"…
$ qid29object17response <dbl> 8, 7, 7, 6, 8, 7, 7, 8, 6, 6, 2, 6, 7, 3, 6, 7, 8, 10, 9, 7,…
$ born_locationobject5response <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "__ot…
$ educationobject8response <chr> "Leaving Certificate", "Degree", "Master's", "Technical or V…
$ employmentobject9response <chr> "Employed full-time", "Employed full-time", "Seeking Employm…
$ incomeobject12quantised <dbl> 6, 6, 7, 3, 7, 2, 5, 6, 10, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5,…
$ incomeobject12response <chr> "€67,001 - €85,000", "€67,001 - €85,000", "€85,001 - €105,00…
$ P1ageobject377Response <dbl> 54, 24, 46, 23, 46, 49, 70, 37, 51, 34, 56, 31, 29, 48, 24, …
$ numhouseholdobject375Response <dbl> 1, 4, 2, 4, 3, 3, 3, 2, 4, 2, 5, 1, 1, 2, 1, 1, 2, 2, 4, 2, …
df_irl <- df_irl_raw |>
dplyr::transmute(
StartDate_irl = utcdateandtime,
ResponseId = as.character(participantprivateid),
sex_irl = branchpbkg,
mpwb_competence = qid12object4response,
mpwb_emotional_stability = qid13object6response,
mpwb_engagement = qid14object8response,
mpwb_meaning = qid15object9response,
mpwb_optimism = qid16object10response,
mpwb_positive_emotion = qid17object11response,
mpwb_positive_relationships = qid18object12response,
mpwb_resilience = qid19object13response,
mpwb_self_esteem = qid20object14response,
mpwb_vitality = qid20object15response,
life_satisfaction = qid29object17response,
ethnicity_citizenship_irl = born_locationobject5response,
# The education options are slightly different from the version
# used for Ireland non-sponsored participants
education_irl = educationobject8response,
# The employment options are slightly different from the version
# used for Ireland non-sponsored participants
employment_irl = employmentobject9response,
# The income brackets are slightly different from the version
# used for Ireland non-sponsored participants
income_irl = incomeobject12quantised,
household_size = numhouseholdobject375Response,
age = P1ageobject377Response
) |>
dplyr::mutate(
sex_orig = dplyr::case_when(
sex_irl == "male" ~ 1,
sex_irl == "female" ~ 2,
sex_irl == "other" ~ 3,
TRUE ~ NA_integer_
),
ethnicity_citizenship_orig = dplyr::case_when(
# The only options given were "Ireland" and "__other"
ethnicity_citizenship_irl == "Ireland" ~ "10",
ethnicity_citizenship_irl == "__other" ~ "11",
TRUE ~ NA_character_
),
education_orig = dplyr::case_when(
education_irl == "Less than Junior (Inter) Cert" ~ 1,
education_irl == "Junior (Inter) Certificate or Equivalent" ~ 2,
education_irl == "Leaving Certificate" ~ 3,
education_irl == "Technical or Vocational Certificate" ~ 4,
education_irl == "Diploma" ~ 5,
education_irl == "Degree" ~ 6,
education_irl == "Master's" ~ 7,
education_irl == "Doctorate" ~ 8,
TRUE ~ NA_integer_
),
employment_orig = dplyr::case_when(
employment_irl == "Employed full-time" ~ "3",
employment_irl == "Employed part-time" ~ "4",
employment_irl == "Student" ~ "1",
employment_irl == "Seeking Employment/Unemployed" ~ "8",
employment_irl == "Homemaker/Carer" ~ "7",
employment_irl == "Unable to Work" ~ "9",
employment_irl == "Retired" ~ "6",
# The option below is not in the original coding scheme
employment_irl == "Self-employed" ~ NA_character_,
TRUE ~ NA_character_
),
income_orig = dplyr::if_else(
# The option 10 = "Prefer not to say" is recoded to NA
income_irl == 10,
NA_integer_,
income_irl
),
Q_Language = "EN-IRL-sponsored",
UserLanguage = "EN-IRL-sponsored",
iso3 = "IRL",
iso2 = "IE",
country = "Ireland",
loc_resident = 1,
loc_country = "Ireland",
lat = 53.3861632,
long = -10.5940283,
irl = 1
) |>
# We need to recode the MPWB items from text to numerical
dplyr::mutate(
dplyr::across(
dplyr::all_of(mpwb_items),
~ as.numeric(base::factor(
.,
levels = c(
# first level will be coded as 1
"Absolutely Disagree",
# second level will be coded as 2, etc.
"Strongly Disagree",
"Disagree",
"Neutral",
"Agree",
"Strongly Agree",
"Absolutely Agree"
)
)))) |>
dplyr::glimpse(width = 100)Rows: 1,200
Columns: 35
$ StartDate_irl <chr> "04/07/2025 09:30", "04/07/2025 09:35", "04/07/2025 09:36", "0…
$ ResponseId <chr> "13767545", "13767552", "13767549", "13767554", "13767544", "1…
$ sex_irl <chr> "male", "female", "female", "female", "female", "female", "mal…
$ mpwb_competence <dbl> 5, 6, 2, 5, 6, 5, 5, 5, 4, 5, 4, 4, 6, 7, 7, 5, 6, 6, 5, 7, 2,…
$ mpwb_emotional_stability <dbl> 6, 5, 5, 3, 5, 5, 6, 5, 3, 5, 4, 5, 5, 6, 7, 5, 5, 7, 6, 5, 2,…
$ mpwb_engagement <dbl> 6, 6, 2, 5, 5, 5, 6, 3, 5, 5, 5, 5, 5, 7, 7, 6, 5, 4, 5, 4, 2,…
$ mpwb_meaning <dbl> 6, 7, 5, 5, 5, 4, 5, 5, 4, 5, 2, 5, 5, 7, 7, 5, 7, 7, 5, 7, 3,…
$ mpwb_optimism <dbl> 5, 6, 5, 3, 5, 3, 6, 5, 4, 5, 3, 5, 5, 7, 7, 5, 7, 7, 6, 7, 3,…
$ mpwb_positive_emotion <dbl> 6, 7, 5, 5, 5, 4, 6, 5, 4, 5, 3, 5, 5, 5, 6, 5, 6, 7, 7, 5, 3,…
$ mpwb_positive_relationships <dbl> 5, 7, 6, 6, 5, 5, 7, 5, 5, 6, 3, 6, 5, 4, 6, 6, 5, 6, 6, 5, 2,…
$ mpwb_resilience <dbl> 5, 5, 5, 6, 6, 4, 5, 5, 5, 4, 5, 4, 5, 5, 5, 6, 6, 7, 6, 6, 3,…
$ mpwb_self_esteem <dbl> 5, 6, 5, 5, 6, 4, 5, 5, 4, 5, 3, 5, 5, 1, 4, 5, 6, 7, 6, 5, 2,…
$ mpwb_vitality <dbl> 5, 5, 4, 3, 5, 3, 5, 5, 4, 5, 3, 4, 4, 1, 4, 5, 6, 7, 4, 4, 2,…
$ life_satisfaction <dbl> 8, 7, 7, 6, 8, 7, 7, 8, 6, 6, 2, 6, 7, 3, 6, 7, 8, 10, 9, 7, 2…
$ ethnicity_citizenship_irl <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "__othe…
$ education_irl <chr> "Leaving Certificate", "Degree", "Master's", "Technical or Voc…
$ employment_irl <chr> "Employed full-time", "Employed full-time", "Seeking Employmen…
$ income_irl <dbl> 6, 6, 7, 3, 7, 2, 5, 6, 10, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5, 1…
$ household_size <dbl> 1, 4, 2, 4, 3, 3, 3, 2, 4, 2, 5, 1, 1, 2, 1, 1, 2, 2, 4, 2, 3,…
$ age <dbl> 54, 24, 46, 23, 46, 49, 70, 37, 51, 34, 56, 31, 29, 48, 24, 34…
$ sex_orig <dbl> 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2,…
$ ethnicity_citizenship_orig <chr> "10", "10", "10", "10", "10", "11", "10", "10", "10", "10", "1…
$ education_orig <dbl> 3, 6, 7, 4, 6, 6, 7, 4, 5, 6, 7, 3, 7, 7, 7, 6, 3, 7, 6, 3, 6,…
$ employment_orig <chr> "3", "3", "8", "3", "3", "7", "6", "3", "7", "3", "7", "4", "3…
$ income_orig <dbl> 6, 6, 7, 3, 7, 2, 5, 6, NA, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5, 1…
$ Q_Language <chr> "EN-IRL-sponsored", "EN-IRL-sponsored", "EN-IRL-sponsored", "E…
$ UserLanguage <chr> "EN-IRL-sponsored", "EN-IRL-sponsored", "EN-IRL-sponsored", "E…
$ iso3 <chr> "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL",…
$ iso2 <chr> "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "I…
$ country <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "Irelan…
$ loc_resident <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ loc_country <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "Irelan…
$ lat <dbl> 53.38616, 53.38616, 53.38616, 53.38616, 53.38616, 53.38616, 53…
$ long <dbl> -10.59403, -10.59403, -10.59403, -10.59403, -10.59403, -10.594…
$ irl <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Sanity check: View the counts of each option
base::table(df_irl$mpwb_competence, df_irl_raw$qid12object4response, useNA = "always")
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 18 0 0 0 0 0 0
2 0 0 0 0 0 0 22 0
3 0 0 0 59 0 0 0 0
4 0 0 0 0 262 0 0 0
5 0 0 498 0 0 0 0 0
6 0 0 0 0 0 229 0 0
7 112 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 20 0 0 0 0 0 0
2 0 0 0 0 0 0 39 0
3 0 0 0 148 0 0 0 0
4 0 0 0 0 231 0 0 0
5 0 0 490 0 0 0 0 0
6 0 0 0 0 0 192 0 0
7 80 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 11 0 0 0 0 0 0
2 0 0 0 0 0 0 18 0
3 0 0 0 111 0 0 0 0
4 0 0 0 0 362 0 0 0
5 0 0 480 0 0 0 0 0
6 0 0 0 0 0 157 0 0
7 61 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 24 0 0 0 0 0 0
2 0 0 0 0 0 0 26 0
3 0 0 0 82 0 0 0 0
4 0 0 0 0 264 0 0 0
5 0 0 479 0 0 0 0 0
6 0 0 0 0 0 213 0 0
7 112 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 21 1 8 3 2 4 0
2 0 2 0 15 13 1 8 0
3 0 0 21 36 45 2 4 0
4 4 2 111 20 144 17 2 0
5 15 0 302 7 47 51 1 0
6 26 0 52 0 9 95 0 0
7 66 0 20 0 2 20 1 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 1 15 1 2 0 1 5 0
2 0 0 5 2 5 2 6 0
3 0 7 17 22 30 3 7 0
4 4 1 85 40 112 14 7 0
5 20 1 293 14 99 79 1 0
6 25 0 64 2 15 82 0 0
7 62 0 14 0 3 32 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 26 0 0 0 0 0 0
2 0 0 0 0 0 0 22 0
3 0 0 0 84 0 0 0 0
4 0 0 0 0 179 0 0 0
5 0 0 498 0 0 0 0 0
6 0 0 0 0 0 235 0 0
7 156 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 23 0 0 0 0 0 0
2 0 0 0 0 0 0 34 0
3 0 0 0 145 0 0 0 0
4 0 0 0 0 248 0 0 0
5 0 0 504 0 0 0 0 0
6 0 0 0 0 0 159 0 0
7 87 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 39 0 0 0 0 0 0
2 0 0 0 0 0 0 34 0
3 0 0 0 110 0 0 0 0
4 0 0 0 0 264 0 0 0
5 0 0 460 0 0 0 0 0
6 0 0 0 0 0 176 0 0
7 117 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
1 0 55 0 0 0 0 0 0
2 0 0 0 0 0 0 87 0
3 0 0 0 247 0 0 0 0
4 0 0 0 0 325 0 0 0
5 0 0 330 0 0 0 0 0
6 0 0 0 0 0 108 0 0
7 48 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0
# For the rows that are not in the Irish dataset
df_pub$irl <- 0
# Merge both datasets
df_merged <- dplyr::bind_rows(df_pub, df_irl) |>
dplyr::relocate(StartDate_irl, .after = StartDate) |>
dplyr::relocate(Q_Language, .after = ResponseId) |>
dplyr::relocate(age, .after = birth_year_orig) |>
dplyr::relocate(sex_irl, .after = sex_orig) |>
dplyr::relocate(ethnicity_citizenship_irl, .after = ethnicity_citizenship_orig) |>
dplyr::relocate(employment_irl, .after = employment_orig) |>
dplyr::relocate(education_irl, .after = education_orig) |>
dplyr::relocate(income_irl, .after = income_orig)
# Total sample size before individual exclusion criteria
nrow(df_merged)[1] 69408
# Sanity check:
# Is the sum of rows of both individual datasets equal to the merged dataset?
(length(df_irl$ResponseId) + length(df_pub$ResponseId)) ==
length(df_merged$ResponseId)[1] TRUE
MPWB
# Sanity check: View the counts of each option
for (i in mpwb_items) {
eval(parse(text = sprintf("table_label(df_pub$%s)", i)))
cat("\n")
}$mpwb_positive_relationships
I receive help and support from people I am close to when I need it.
1 2 3 4 5 6 7 <NA>
1751 1955 4594 7323 21636 14217 11715 5017
Class: numeric
$mpwb_meaning
I feel what I do in my life is valuable and worthwhile.
1 2 3 4 5 6 7 <NA>
1966 2448 5393 8627 21037 13474 10271 4992
Class: numeric
$mpwb_competence
I feel a sense of accomplishment from what I do.
1 2 3 4 5 6 7 <NA>
1907 2582 6319 9177 22141 13119 7974 4989
Class: numeric
$mpwb_engagement
I feel absorbed in what I am doing.
1 2 3 4 5 6 7 <NA>
1229 2041 6701 11039 22993 12211 7040 4954
Class: numeric
$mpwb_self_esteem
I feel positive about myself.
1 2 3 4 5 6 7 <NA>
2009 2703 6688 9186 21232 12895 8582 4913
Class: numeric
$mpwb_optimism
I am optimistic about my future.
1 2 3 4 5 6 7 <NA>
2770 3118 6488 10098 19560 11972 9255 4947
Class: numeric
$mpwb_positive_emotion
I feel happy.
1 2 3 4 5 6 7 <NA>
2114 2673 6253 12053 21446 11308 7423 4938
Class: numeric
$mpwb_emotional_stability
I feel calm and peaceful.
1 2 3 4 5 6 7 <NA>
2571 3882 10398 11446 19752 9404 5835 4920
Class: numeric
$mpwb_resilience
I recover quickly from things that go wrong in my life.
1 2 3 4 5 6 7 <NA>
2385 3916 10520 10552 21264 9366 5205 5000
Class: numeric
$mpwb_vitality
I feel full of energy.
1 2 3 4 5 6 7 <NA>
3422 5107 11610 12272 17740 8110 5015 4932
Class: numeric
df_merged <- df_merged |>
dplyr::rowwise() |>
dplyr::mutate(
# Identify participants that completed all MPWB items
mpwb_n = base::sum(!is.na(dplyr::c_across(dplyr::all_of(mpwb_items)))),
# Calculate variance, average and sum score of the MPWB items
# explicitly to only for participants who answered all MPWB items
mpwb_mean = dplyr::if_else(
mpwb_n == 10,
base::mean(dplyr::c_across(dplyr::all_of(mpwb_items))),
NA_real_
),
mpwb_var = dplyr::if_else(
mpwb_n == 10,
stats::var(dplyr::c_across(dplyr::all_of(mpwb_items))),
NA_real_
),
mpwb_sum = dplyr::if_else(
mpwb_n == 10,
base::sum(dplyr::c_across(dplyr::all_of(mpwb_items))),
NA_real_
)
) |>
# remove the rowwise computation
dplyr::ungroup() |>
# organise the variables positions
dplyr::relocate(mpwb_n:mpwb_sum, .after = mpwb_vitality)
# Sanity check: View the new MPWB variables
dplyr::glimpse(df_merged |> dplyr::select(dplyr::starts_with("mpwb_")), width = 100)Rows: 69,408
Columns: 14
$ mpwb_competence <dbl> 6, 5, 5, 5, 5, 5, 5, 7, 5, 5, 4, 7, 5, 5, 5, 4, 6, 6, 5, 5, 7,…
$ mpwb_emotional_stability <dbl> 6, 3, 5, 5, 5, 4, 5, 7, 6, 7, 5, 5, 4, 7, 5, 4, 7, 6, 4, 5, 5,…
$ mpwb_engagement <dbl> 6, 6, 5, 6, 5, 4, 5, 7, 3, 5, 5, 6, 7, 7, 5, 4, 4, 5, 5, 3, 7,…
$ mpwb_meaning <dbl> 6, 3, 5, 6, 4, 4, 6, 7, 5, 6, 4, 5, 4, 6, 5, 4, 7, 5, 4, 5, 7,…
$ mpwb_optimism <dbl> 7, 5, 5, 6, 5, 3, 7, 7, 6, 7, 4, 6, 5, 6, 5, 4, 7, 5, 6, 6, 7,…
$ mpwb_positive_emotion <dbl> 5, 3, 5, 6, 5, 7, 6, 7, 7, 5, 4, 5, 6, 6, 4, 4, 7, 5, 5, 5, 7,…
$ mpwb_positive_relationships <dbl> 5, 5, 5, 4, 7, 7, 6, 7, 5, 7, 5, 6, 5, 6, 5, 4, 6, 5, 4, 3, 7,…
$ mpwb_resilience <dbl> 5, 5, 5, 6, 5, 3, 6, 7, 5, 7, 4, 6, 4, 6, 4, 4, 7, 7, 4, 3, 6,…
$ mpwb_self_esteem <dbl> 6, 5, 5, 7, 3, 4, 7, 7, 6, 5, 4, 7, 5, 6, 5, 4, 6, 6, 6, 6, 7,…
$ mpwb_vitality <dbl> 5, 1, 4, 5, 3, 4, 5, 7, 6, 7, 4, 6, 4, 6, 5, 4, 5, 5, 5, 5, 5,…
$ mpwb_n <int> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
$ mpwb_mean <dbl> 5.7, 4.1, 4.9, 5.6, 4.7, 4.5, 5.8, 7.0, 5.4, 6.1, 4.3, 5.9, 4.…
$ mpwb_var <dbl> 0.4555556, 2.3222222, 0.1000000, 0.7111111, 1.3444444, 2.05555…
$ mpwb_sum <dbl> 57, 41, 49, 56, 47, 45, 58, 70, 54, 61, 43, 59, 49, 61, 48, 40…
# Sanity check: Are there missing values in the sum score when mpwb_n is 10?
base::table(df_merged$mpwb_n, is.na(df_merged$mpwb_sum), useNA = "always")
FALSE TRUE <NA>
0 0 3247 0
1 0 722 0
2 0 396 0
3 0 372 0
4 0 275 0
5 0 239 0
6 0 240 0
7 0 193 0
8 0 152 0
9 0 174 0
10 63398 0 0
<NA> 0 0 0
# Sanity check: Are there values in the var score when mpwb_n is not 10?
df_merged |> dplyr::filter(mpwb_n != 10 & (!is.na(mpwb_var))) |> base::nrow()[1] 0
Completion time
df_merged <- df_merged |>
dplyr::rowwise() |>
dplyr::mutate(
# Count how many items were answered (not NA) after the debts item
# (all items up to the debts item were forced-response)
n_items_after = base::sum(!is.na(dplyr::c_across(
c(
followup,
phq_interest,
phq_down,
gad_anxious,
gad_worry,
childhood_SES,
fin_outlook,
fin_outlook_conf,
attention_care,
work_arrangement
)
))),
# Calculate adjusted duration if the mandatory items were completed.
# Some survey versions have different variables of the same item,
# but all versions have 20 mandatory items before debts.
total_items = dplyr::if_else(
!is.na(debts_orig),
20 + n_items_after,
NA_real_),
duration_adj = dplyr::if_else(
!is.na(debts_orig),
duration_sec / total_items,
NA_real_)
) |>
dplyr::ungroup() |>
# organise the variables positions
dplyr::relocate(n_items_after:duration_adj, .after = duration_sec)
# Sanity check: View the new variables
dplyr::glimpse(df_merged |> dplyr::select(duration_sec:duration_adj), width = 100)Rows: 69,408
Columns: 4
$ duration_sec <dbl> 1028, 442, 370, 426, 512, 344, 341, 744, 582, 1006, 233, 173, 6735, 120, 270…
$ n_items_after <int> 1, 10, 1, 10, 9, 10, 10, 1, 10, 9, 1, 10, 1, 10, 10, 10, 1, 10, 9, 1, 9, 10,…
$ total_items <dbl> 21, 30, 21, 30, 29, 30, 30, 21, 30, 29, 21, 30, 21, 30, 30, 30, 21, 30, 29, …
$ duration_adj <dbl> 48.952381, 14.733333, 17.619048, 14.200000, 17.655172, 11.466667, 11.366667,…
# Sanity check: Is there a mismatch between n_items_after and total_items?
base::table(df_merged$n_items_after, df_merged$total_items, useNA = "always")
20 21 24 25 26 27 28 29 30 <NA>
0 1899 0 0 0 0 0 0 0 0 13381
1 0 14144 0 0 0 0 0 0 0 0
4 0 0 6 0 0 0 0 0 0 0
5 0 0 0 44 0 0 0 0 0 0
6 0 0 0 0 80 0 0 0 0 0
7 0 0 0 0 0 1 0 0 0 0
8 0 0 0 0 0 0 1313 0 0 0
9 0 0 0 0 0 0 0 9303 0 0
10 0 0 0 0 0 0 0 0 29237 0
<NA> 0 0 0 0 0 0 0 0 0 0
# Sanity check: Is there unexpected missing values in total_items?
df_merged |>
dplyr::summarise(
all_total_items_missing_when_debts_missing =
all(is.na(total_items[is.na(debts_orig)])),
any_total_items_present_when_debts_missing =
any(!is.na(total_items[is.na(debts_orig)])))# A tibble: 1 × 2
all_total_items_missing_when_debts_missing any_total_items_present_when_debts_missing
<lgl> <lgl>
1 TRUE FALSE
# Sanity check: Is there unexpected missing values in n_items_after?
base::table(df_merged$n_items_after, is.na(df_merged$debts_orig), useNA = "always")
FALSE TRUE <NA>
0 1899 13381 0
1 14144 0 0
4 6 0 0
5 44 0 0
6 80 0 0
7 1 0 0
8 1313 0 0
9 9303 0 0
10 29237 0 0
<NA> 0 0 0
# Sanity check: View the range of duration_adj
df_merged |>
dplyr::filter(!is.na(debts_orig)) |>
dplyr::summarise(
min_duration_adj = min(duration_adj, na.rm = TRUE),
max_duration_adj = max(duration_adj, na.rm = TRUE)
)# A tibble: 1 × 2
min_duration_adj max_duration_adj
<dbl> <dbl>
1 1.7 22309.
# Plot intra-individual variance vs time, faceted by country
ggplot2::ggplot(df_sub, ggplot2::aes(x = duration_adj, y = mpwb_var)) +
ggplot2::geom_point(alpha = 0.25, size = 0.8) +
ggplot2::geom_smooth(formula = y ~ x, method = "loess", se = TRUE) +
ggplot2::facet_wrap(~ country, scales = "free_y", ncol = 4, nrow = 25) +
ggplot2::labs(
x = "Duration adjusted (seconds)",
y = "Within-person variance across MPWB"
) +
ggplot2::theme(
strip.text = ggplot2::element_text(size = 9, face = "bold"),
axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
)ggplot2::ggplot(df_sub, ggplot2::aes(x = duration_adj, y = mpwb_sum)) +
ggplot2::geom_point(alpha = 0.2, size = 0.8) +
ggplot2::geom_smooth(formula = y ~ x, method = "loess", se = TRUE) +
ggplot2::facet_wrap(~ country, scales = "free", ncol = 4, nrow = 23) +
ggplot2::labs(
x = "Duration adjusted (seconds)",
y = "MPWB Sum"
) +
ggplot2::theme(
strip.text = ggplot2::element_text(size = 9, face = "bold"),
axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
)PHQ4
The original PHQ-4 has vague verbal anchors that could limit the comparability of results across languages. For example, the option “Several days” could be interpreted as 2-3 days or as more than 7 days in other languages. Since “One week” is not more than 7 days, we decided to recode it as “Several days”.
| Used anchors | Original anchors | Recoded value |
|---|---|---|
| Never (1) | Not at all (0) | 0 |
| Once or twice (1–2) (2) | Several days (1) | 1 |
| A few days (3–4) (3) | Several days (1) | 1 |
| Several days (4) | Several days (1) | 1 |
| One week (5) | Several days (1) | 1 |
| More than a week (6) | More than half the days (2) | 2 |
| Every day / nearly every day(7) | Nearly every day (3) | 3 |
# Sanity check: View the counts of each option
for (i in phq4_items) {
eval(parse(text = sprintf("table_label(df_merged$%s)", i)))
cat("\n")
}$phq_interest
Over the last 2 weeks, how often have you been bothered by the following problems? - Little interest or pleasure in doing things
1 2 3 4 5 6 7 <NA>
6413 13900 8172 3943 1230 1948 4378 29424
Class: numeric
$phq_down
Over the last 2 weeks, how often have you been bothered by the following problems? - Feeling down, depressed or hopeless
1 2 3 4 5 6 7 <NA>
8727 14216 6675 3362 1070 2068 3866 29424
Class: numeric
$gad_anxious
Over the last 2 weeks, how often have you been bothered by the following problems? - Feeling nervous, anxious or on edge
1 2 3 4 5 6 7 <NA>
5755 13322 7569 4542 1275 2303 5218 29424
Class: numeric
$gad_worry
Over the last 2 weeks, how often have you been bothered by the following problems? - Not being able to stop or control worrying
1 2 3 4 5 6 7 <NA>
11900 11965 5229 3206 1240 2142 4302 29424
Class: numeric
# Function to recode PHQ-4 items.
recode_phq <- function(i) {
dplyr::case_when(
i == 1 ~ 0,
i %in% 2:5 ~ 1,
i == 6 ~ 2,
i == 7 ~ 3,
TRUE ~ NA_real_
)
}
# Sanity check: Count missing values in PHQ-4 items when gad_worry is not missing
df_merged |>
dplyr::filter(!is.na(gad_worry)) |>
dplyr::summarise(
dplyr::across(dplyr::all_of(phq4_items), ~ base::sum(is.na(.x))),
n_total = dplyr::n()
)# A tibble: 1 × 5
phq_interest phq_down gad_anxious gad_worry n_total
<int> <int> <int> <int> <int>
1 0 0 0 0 39984
# Apply recoding and compute sum scores
# only for participants who answered all PHQ-4 items
# (i.e., not missing in the last PHQ item)
# gad_worry was the last item in the PHQ-4 matrix
df_merged <- df_merged |>
dplyr::mutate(
# Calculate the sums for phq2, gad2, and phq4
# only for participants who answered all PHQ-4 items
phq2_sum = dplyr::if_else(
!is.na(gad_worry),
phq_down + phq_interest,
NA_real_
),
gad2_sum = dplyr::if_else(
!is.na(gad_worry),
gad_worry + gad_anxious,
NA_real_
),
phq4_sum = phq2_sum + gad2_sum
) |>
dplyr::mutate(
# Apply the recoding function to the individual PHQ items
dplyr::across(all_of(phq4_items), recode_phq, .names = "{.col}_rec"),
# Calculate the sums for recoded phq2, gad2, and phq4
# only for participants who answered all PHQ-4 items
phq2_sum_rec = dplyr::if_else(
!is.na(gad_worry),
phq_down_rec + phq_interest_rec,
NA_real_
),
gad2_sum_rec = dplyr::if_else(
!is.na(gad_worry),
gad_worry_rec + gad_anxious_rec,
NA_real_
),
phq4_sum_rec = phq2_sum_rec + gad2_sum_rec,
# Create a variable with cut-off labels
phq4_cat = dplyr::case_when(
!is.na(phq4_sum_rec) & phq4_sum_rec >= 0 & phq4_sum_rec <= 2 ~ "Normal (0–2)",
!is.na(phq4_sum_rec) & phq4_sum_rec >= 3 & phq4_sum_rec <= 5 ~ "Mild (3–5)",
!is.na(phq4_sum_rec) & phq4_sum_rec >= 6 & phq4_sum_rec <= 8 ~ "Moderate (6–8)",
!is.na(phq4_sum_rec) & phq4_sum_rec >= 9 & phq4_sum_rec <= 12 ~ "Severe (9–12)",
# I expect character values, so NA_character_
TRUE ~ NA_character_
),
# Create variables for depression and anxiety screening,
# using the standard cut-off of 3 on the respective subscales
depression_screen = dplyr::case_when(
is.na(phq2_sum_rec) ~ NA_real_,
phq2_sum_rec >= 3 ~ 1,
TRUE ~ 0
),
anxiety_screen = dplyr::case_when(
is.na(gad2_sum_rec) ~ NA_real_,
gad2_sum_rec >= 3 ~ 1,
TRUE ~ 0
)
) |>
dplyr::relocate(phq2_sum:anxiety_screen, .after = gad_worry)
# Sanity checks (view the new variables)
dplyr::glimpse(
df_merged |>
dplyr::filter(!is.na(gad_worry)) |>
dplyr::select(phq_interest:anxiety_screen),
width = 100
)Rows: 39,984
Columns: 17
$ phq_interest <dbl> 2, 2, 3, 6, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 7, 6, 2, 1, 1, 2, 1, 2, 1, 3, …
$ phq_down <dbl> 3, 1, 3, 4, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 3, 6, 2, 1, 1, 2, 1, 2, 2, 1, …
$ gad_anxious <dbl> 2, 2, 3, 7, 2, 2, 1, 3, 2, 3, 2, 1, 1, 2, 3, 7, 2, 1, 1, 2, 1, 1, 1, 2, …
$ gad_worry <dbl> 1, 2, 3, 7, 1, 1, 1, 3, 2, 3, 2, 1, 2, 1, 1, 3, 2, 1, 1, 2, 1, 1, 2, 1, …
$ phq2_sum <dbl> 5, 3, 6, 10, 2, 3, 2, 4, 3, 4, 4, 2, 4, 3, 10, 12, 4, 2, 2, 4, 2, 4, 3, …
$ gad2_sum <dbl> 3, 4, 6, 14, 3, 3, 2, 6, 4, 6, 4, 2, 3, 3, 4, 10, 4, 2, 2, 4, 2, 2, 3, 3…
$ phq4_sum <dbl> 8, 7, 12, 24, 5, 6, 4, 10, 7, 10, 8, 4, 7, 6, 14, 22, 8, 4, 4, 8, 4, 6, …
$ phq_interest_rec <dbl> 1, 1, 1, 2, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 3, 2, 1, 0, 0, 1, 0, 1, 0, 1, …
$ phq_down_rec <dbl> 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 2, 1, 0, 0, 1, 0, 1, 1, 0, …
$ gad_anxious_rec <dbl> 1, 1, 1, 3, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 3, 1, 0, 0, 1, 0, 0, 0, 1, …
$ gad_worry_rec <dbl> 0, 1, 1, 3, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, …
$ phq2_sum_rec <dbl> 2, 1, 2, 3, 0, 1, 0, 2, 1, 2, 2, 0, 2, 1, 4, 4, 2, 0, 0, 2, 0, 2, 1, 1, …
$ gad2_sum_rec <dbl> 1, 2, 2, 6, 1, 1, 0, 2, 2, 2, 2, 0, 1, 1, 1, 4, 2, 0, 0, 2, 0, 0, 1, 1, …
$ phq4_sum_rec <dbl> 3, 3, 4, 9, 1, 2, 0, 4, 3, 4, 4, 0, 3, 2, 5, 8, 4, 0, 0, 4, 0, 2, 2, 2, …
$ phq4_cat <chr> "Mild (3–5)", "Mild (3–5)", "Mild (3–5)", "Severe (9–12)", "Normal (0–2)…
$ depression_screen <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ anxiety_screen <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
# Sanity check: View the range of the recoded variables
base::table(df_merged$phq2_sum_rec, useNA = "always")
0 1 2 3 4 5 6 <NA>
4044 6610 21207 1987 2789 891 2456 29424
0 1 2 3 4 5 6 <NA>
4500 8337 18360 1861 2634 1130 3162 29424
0 1 2 3 4 5 6 7 8 9 10 11 12 <NA>
1958 2617 4413 6396 13785 1936 2304 1135 1600 794 1046 496 1504 29424
# Sanity check: Is there mismatch missing values between the two screenings?
dplyr::count(df_merged, depression_screen, anxiety_screen, name = "n")# A tibble: 5 × 3
depression_screen anxiety_screen n
<dbl> <dbl> <int>
1 0 0 28651
2 0 1 3210
3 1 0 2546
4 1 1 5577
5 NA NA 29424
Life Satisfaction
Income, Assets, and Debts
The open text field contained a Qualtrics validation that forced participants to answer only with digits [0-9], commas, and periods. However, a small amount of participants managed to enter values beyond this validation (e.g., including percentage signs, letters, or other characters).
For the countries where digits 0-9 are not the default numeric keypad, the translations included instructions requesting that participants use only digits 0-9 (Algeria, Bahrain, Chad, Egypt, Kuwait, Morocco, Oman, Saudi Arabia, UAE, Lebanon, Qatar).
Clean numbers
# Sanity check: View the counts of each option
# Option 10 is "Specify: [open text field]"
table_label(df_merged$income_orig)$income_orig
Please indicate what your total household income was for 2024 (before taxes). You can select an option or indicate a precise value. If you are retired or live off a pension, please indicate the amount your household received during the year in total payments. - Selected Choice
0 1 2 3 4 5 6 7 8 9 10 <NA>
2450 5532 6495 6707 6593 5832 5160 5042 3933 5890 6974 8800
Class: numeric
[1] "character"
[1] "character"
[1] "character"
# Participants were able to write in a open text field their income, assets, and debts.
head(unique(df_merged$income_text_orig), 20) [1] NA "8000" "15000" "7000" "243000" "124000" "12345678" "150000" "700.000" "10000" "50000" "400" "2500"
[14] "1500" "6000" "300000" "3600" "636000" "643" "435"
[1] "5" "2" "20.000" "1000000" "5000" "0,00" "250000" "1,000.00" "50000" "00" "0" "700000"
[13] "20000000" "600000" "70000000" "1" "100,000" "50000000" "7000000" "350000000"
[1] "10000000" "2" "18000" "0" "125000" "0,00" "1,000.00" "20.000,00" "1500" "200000000"
[11] "90000" "150000" "200000" "1300000" "10,000" "10000" "120000" "66000000" "100,000,000" "200"
# View values that end with "," or "."
df_merged |>
dplyr::filter(grepl("[.,]$", income_text_orig)) |>
dplyr::select(ResponseId, income_text_orig) |>
base::nrow();[1] 0
# A tibble: 18 × 2
ResponseId assets_orig
<chr> <chr>
1 R_2ilYHj1poprgCX8 1,00,000,
2 R_2Iaw1PAzIm22N4f 1,500,000.
3 R_7Xai7kgm6ni70up 1000000.
4 R_3Ezenl8l5Vbehqq 650000.
5 R_8EouiGcGN3SO3RO 200000.
6 R_7QEIOPC6sqjQ7jF 300000,
7 R_8NwCae5exBdtR98 600,000.
8 R_3wBLehUhjYbWTbq 100000.
9 R_9d3Tm1Wu2M6gFoh 0,
10 R_2E6m2ErMvEOb0wx 4000.
11 R_6elc9peo8vxbMVH 3000.
12 R_7rYZllIWkzJDJ2X 10.
13 R_9GHpnvrI5tXbopH 350000.
14 R_7sbQ258PDYZf45A 2,000000.
15 R_1Hc7FpY3tW9nsh7 500,000.
16 R_9dhgf8xvk6Ib8LX 100000.
17 R_8eOIl90Z2J6iB6k 5,000,000.
18 R_8CSIx79Mqkq1qaB 600,000,000,
# A tibble: 6 × 2
ResponseId debts_orig
<chr> <chr>
1 R_7QEIOPC6sqjQ7jF 23000,
2 R_1wuhfjwEOnWp9AS 0.
3 R_16SQZLnjugK3f6p 0.
4 R_5bW2dvfC8MaUgLB 5,00.
5 R_8CB0K2YQUfWxGY1 0.
6 R_5xPMFVkMda7RhuS 18000.
# Create function to clean numbers
clean_number <- function(i) {
parse_one <- function(s) {
# Keep NA as NA
if (is.na(s))
return(NA_real_)
# Remove leading/trailing spaces
s <- stringr::str_trim(s)
# first character must be a digit, otherwise NA
if (!stringr::str_detect(s, "^[0-9]"))
return(NA_real_)
# If contains "%" or "x", set to NA
if (stringr::str_detect(s, "%") || stringr::str_detect(s, "[xX]"))
return(NA_real_)
# Handle scientific notation ( if e/E is present)
if (stringr::str_detect(s, "[eE]")) {
s_sci <- s |>
stringr::str_replace_all(",", ".") |>
stringr::str_replace_all("[^0-9eE+\\-\\.]", "")
val <- as.numeric(s_sci)
return(val)
}
# Remove non-numeric characters (except "." and ",")
s <- stringr::str_remove_all(s, "[^0-9,\\.]")
# Allow "0"
if (s == "0")
return(0)
# Place values of 0.0 / 0.00 / 0,0 / 0,00 / 0,000 as 0
if (stringr::str_detect(s, "^0[\\.,]0{1,3}$"))
return(0)
# Otherwise, anything else starting with 0 and longer than 1 char -> NA
# For example: "007", "01", "0.7", "0,7", "0.000", "0,000", "0002"
if (stringr::str_detect(s, "^0") && base::nchar(s) > 1)
return(NA_real_)
# Remove "." or "," at the very end
# For example: "1.000.000." -> "1.000.000"
s <- stringr::str_replace(s,"[,\\.]$","")
# Identify last occurrence of "," or "." as decimal separator
# Some countries use "," as decimal separator and others use "."
m <- stringr::str_match(s, "([,\\.])([0-9]*)$")
if (!is.na(m[1])) {
# Count the number of digits after the last separator
sep <- m[2]
digits_after <- m[3]
len <- base::nchar(digits_after)
if (len >= 3) {
# Thousands separator, remove all separators
# For example: "1.000.000" -> "1000000"
s <- stringr::str_remove_all(s, "[,.]")
} else {
# Decimal, keep only last separator as decimal
# Remove all other separators
# For example: "1.000.000.00" -> "1000000.00"
# "1,000,000,00" -> "1000000,00"
s_wo_last <- stringr::str_sub(s, 1, nchar(s) - len - 1)
s_wo_last <- stringr::str_remove_all(s_wo_last, "[,.]")
# This R session uses "." as decimal separator,
# so we need to convert accordingly
# For example: "1000000,00" -> "1000000.00"
s <- paste0(s_wo_last, ".", digits_after)
}
}
# In R, numerical values have 53 bits of precision (9.0e15),
# so very large numbers that exceed R's numeric limit will be rounded
# to the nearest representable double.
# For example, as.numeric("9999999999999999999") returns 10000000000000002048.
as.numeric(s)
}
vapply(i, parse_one, numeric(1))
}
# Sanity check:
clean_number(c(",1", "0.1", "0,75", "1%", "1000", "1000000,00", "1.000",
"1,00,000", "1.000.000.00", "1.000.000.", "0010", "10x", "7e-1",
"9999999999999999999", "0", "0.0", "0,0", "0.00", "0,00", "07",
"0.7", "0,7", "00", "00,00", "00.00")) ,1 0.1 0,75 1% 1000 1000000,00
NA NA NA NA 1000.0 1000000.0
1.000 1,00,000 1.000.000.00 1.000.000. 0010 10x
1000.0 100000.0 1000000.0 1000000.0 NA NA
7e-1 9999999999999999999 0 0.0 0,0 0.00
0.7 10000000000000002048.0 0.0 0.0 0.0 0.0
0,00 07 0.7 0,7 00 00,00
0.0 NA NA NA NA NA
00.00
NA
# Apply function to the values in open text fields
df_merged <- df_merged |>
dplyr::mutate(
income_text_clean = clean_number(income_text_orig),
assets_clean = clean_number(assets_orig),
debts_clean = clean_number(debts_orig)) |>
dplyr::relocate(income_text_clean, .after = income_text_orig) |>
dplyr::relocate(assets_clean, .after = assets_orig) |>
dplyr::relocate(debts_clean, .after = debts_orig)
# Sanity check: View changes between original and cleaned income text
df_merged |>
dplyr::mutate(
income_text_clean = as.character(income_text_clean),
n_digits_orig = stringr::str_count(income_text_orig, "[0-9]"),
n_digits_clean = stringr::str_count(income_text_clean, "[0-9]")
) |>
dplyr::filter(income_text_clean != income_text_orig) |>
dplyr::select(
ResponseId,
income_text_orig,
income_text_clean,
n_digits_orig,
n_digits_clean
) |>
print_reactable(sorted_col = "income_text_orig", width = 800)# Sanity check: View changes between original and cleaned assets text
df_merged |>
dplyr::mutate(
assets_clean = as.character(assets_clean),
debts_clean = as.character(debts_clean),
n_digits_orig = stringr::str_count(assets_orig, "[0-9]"),
n_digits_clean = stringr::str_count(assets_clean, "[0-9]")
) |>
dplyr::filter(assets_clean != assets_orig) |>
dplyr::select(
ResponseId,
assets_orig,
assets_clean,
n_digits_orig,
n_digits_clean
) |>
print_reactable(sorted_col = "assets_orig", width = 800)# Sanity check: View the new cleaned variables
df_merged |>
dplyr::select(
income_orig,
income_text_orig,
income_text_clean,
assets_orig,
assets_clean,
debts_orig,
debts_clean
) |>
dplyr::glimpse(width = 150)Rows: 69,408
Columns: 7
$ income_orig <dbl> 7, 9, 6, 9, 7, 8, 5, 4, 5, 7, 1, 5, 4, 6, 7, 6, 10, 10, 6, 3, 9, 7, 2, 3, 3, 1, 9, 10, 4, 4, 4, 8, 8, 9, 9, 10, 2, 3, 10, …
$ income_text_orig <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "8000", "15000", NA, NA, NA, NA, NA, NA, NA, NA, NA, "7000…
$ income_text_clean <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8000, 15000, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7000, NA,…
$ assets_orig <chr> "5", "2", "20.000", "1000000", "5000", "0,00", "250000", "1,000.00", "50000", "00", "0", "1000000", "0", "700000", "200000…
$ assets_clean <dbl> 5, 2, 20000, 1000000, 5000, 0, 250000, 1000, 50000, NA, 0, 1000000, 0, 700000, 20000000, 0, 0, 600000, 70000000, 1, 0, 0, …
$ debts_orig <chr> "10000000", "2", "18000", "0", "125000", "0,00", "0", "1,000.00", "0", "20.000,00", "1500", "0", "0", "0", "200000000", "0…
$ debts_clean <dbl> 10000000, 2, 18000, 0, 125000, 0, 0, 1000, 0, 20000, 1500, 0, 0, 0, 200000000, 0, 90000, 0, 0, 0, 150000, 0, 200000, 0, 0,…
# Sanity check: Count missing values in cleaned variables
dplyr::summarise(df_merged,
n_income_orig_text = sum(!is.na(income_text_orig)),
n_income_text_clean_na = sum(is.na(income_text_clean) & !is.na(income_text_orig)),
n_assets_orig = sum(!is.na(assets_orig)),
n_assets_clean_na = sum(is.na(assets_clean) & !is.na(assets_orig)),
n_debts_orig = sum(!is.na(debts_orig)),
n_debts_clean_na = sum(is.na(debts_clean) & !is.na(debts_orig))
)# A tibble: 1 × 6
n_income_orig_text n_income_text_clean_na n_assets_orig n_assets_clean_na n_debts_orig n_debts_clean_na
<int> <int> <int> <int> <int> <int>
1 6973 0 56550 861 56027 909
# Sanity check: View the new cleaned variables
# View values that contain non-numeric characters besides "." and ","
df_merged |>
dplyr::filter(!stringr::str_detect(income_text_orig, "^[0-9,\\.]+$") &
!is.na(income_orig)) |> select(income_text_orig, income_text_clean) |>
base::nrow();[1] 0
df_merged |>
dplyr::filter(!stringr::str_detect(assets_orig, "^[0-9,\\.]+$") &
!is.na(assets_orig)) |> select(assets_orig, assets_clean);# A tibble: 17 × 2
assets_orig assets_clean
<chr> <dbl>
1 -0 NA
2 +10000 NA
3 40% NA
4 4.5e7 45000000
5 1.78e10 17800000000
6 -0 NA
7 +80.000 NA
8 6.265e9 6265000000
9 -0 NA
10 0x0 NA
11 1.3425e10 13425000000
12 10 % NA
13 10% NA
14 2% NA
15 30% NA
16 0% NA
17 +1000000 NA
df_merged |>
dplyr::filter(!stringr::str_detect(debts_orig, "^[0-9,\\.]+$") &
!is.na(debts_orig)) |> select(debts_orig, debts_clean)# A tibble: 10 × 2
debts_orig debts_clean
<chr> <dbl>
1 7% NA
2 50% NA
3 8.95e8 895000000
4 5% NA
5 , 60000 NA
6 10% NA
7 10% NA
8 20% NA
9 ,0% NA
10 +4000 NA
Add financial country-level values
fin_values <-
readr::read_csv("111_country_variables.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 126
Columns: 23
$ country <chr> "Albania", "Algeria", "Angola", "Argentina", "Armenia", "…
$ language <chr> "Albanian", "Arabic", "Portuguese", "Spanish", "Armenian"…
$ UserLanguage <chr> "SQI-ALB", "AR-DZA", "PT-AGO", "ES-ARG", "AM-ARM", "EN-AU…
$ income_period <chr> "monthly", "monthly", "monthly", "monthly", "monthly", "a…
$ income_type <chr> "gross", "gross", "gross", "gross", "gross", "gross", "ne…
$ income_year <dbl> 2024, 2025, 2024, 2024, 2024, 2025, 2024, 2024, 2024, 202…
$ income_currency <chr> "lek", "د.ج", "Kz", "$ (peso)", "ՀՀ դրամ", "AU$", "€", "د…
$ income_currency_position <chr> "left", "right", "left", "right", "left", "right", "left"…
$ income_cutoff_min <dbl> 12000, 10000, 50000, 250000, 15000, 40000, 14508, 200, 20…
$ assets_cutoff_min <dbl> 1000, 10000, 0, 100, 10000, 500, 0, 1000, 1000, 100, 100,…
$ debts_cutoff_min <dbl> 1000, 1000, 10000, 100, 10000, 0, 0, 100, 100, 10, 100, 1…
$ assets_upper_limit <dbl> 40000001, 100000000, NA, 350000000, 50000000, 30000000, 2…
$ debts_upper_limit <dbl> 50000001, 300000000, NA, 350000000, 50000000, 3000000, 10…
$ wages_per_year <dbl> 12, 12, 13, 13, 12, NA, NA, NA, 12, NA, 13, 13, 13, 12, N…
$ inflation2024_factor <dbl> NA, 1.0010, NA, NA, NA, 1.0182, NA, NA, NA, NA, NA, NA, N…
$ one_local_unit_to_USD_conversion <dbl> 0.010738447, 0.007728573, 0.001149628, 0.001093261, 0.002…
$ one_USD_to_local_unit_conversion <dbl> 93.123, 129.390, 869.846, 914.695, 392.730, 1.531, 0.924,…
$ country_region <chr> "Europe & Central Asia", "Middle East, North Africa, Afgh…
$ continent <chr> "Europe", "MENA", "Africa", "South America", "Europe", "O…
$ country_incomegroup <chr> "Upper middle income", "Upper middle income", "Lower midd…
$ soft_launch <chr> "June 2", "June 7", "June 2", "June 2", "June 2", "June 5…
$ target_size <dbl> 300, 600, 600, 600, 300, 600, 300, 300, 300, 1200, 600, 6…
$ comment_country <chr> NA, "Collaborator said that the household income values a…
[1] 69408
df_merged <- df_merged |>
dplyr::left_join(dplyr::select(fin_values, -country), by = "UserLanguage")
# Sanity check: Number of rows should remain the same
nrow(df_merged)[1] 69408
Create categorical variables
# Add categorical variable
df_merged <- df_merged |>
dplyr::mutate(
# Considers all options
income_orig_cat_11 =
dplyr::case_when(
income_orig == 0 ~ "No income",
income_orig == 1 ~ "Second decile",
income_orig == 2 ~ "Third decile",
income_orig == 3 ~ "Fourth decile",
income_orig == 4 ~ "Fifth decile",
income_orig == 5 ~ "Sixth decile",
income_orig == 6 ~ "Seventh decile",
income_orig == 7 ~ "Eighth decile",
income_orig == 8 ~ "Ninth decile",
income_orig == 9 ~ "Tenth decile",
income_orig == 10 ~ "Specify",
TRUE ~ NA_character_
),
# Only considers the first 10 options and gives NA to "Specify"
income_orig_cat_10 =
dplyr::case_when(
income_orig == 0 ~ "No income",
income_orig == 1 ~ "Second decile",
income_orig == 2 ~ "Third decile",
income_orig == 3 ~ "Fourth decile",
income_orig == 4 ~ "Fifth decile",
income_orig == 5 ~ "Sixth decile",
income_orig == 6 ~ "Seventh decile",
income_orig == 7 ~ "Eighth decile",
income_orig == 8 ~ "Ninth decile",
income_orig == 9 ~ "Tenth decile",
TRUE ~ NA_character_
)
) |>
dplyr::relocate(income_orig_cat_11, income_orig_cat_10, .after = income_orig)
# Sanity check: View the mapping distribution of the new income variables
df_merged |> dplyr::count(income_orig, income_orig_cat_11)# A tibble: 12 × 3
income_orig income_orig_cat_11 n
<dbl> <chr> <int>
1 0 No income 2450
2 1 Second decile 5532
3 2 Third decile 6495
4 3 Fourth decile 6707
5 4 Fifth decile 6593
6 5 Sixth decile 5832
7 6 Seventh decile 5160
8 7 Eighth decile 5042
9 8 Ninth decile 3933
10 9 Tenth decile 5890
11 10 Specify 6974
12 NA <NA> 8800
# A tibble: 12 × 3
income_orig income_orig_cat_10 n
<dbl> <chr> <int>
1 0 No income 2450
2 1 Second decile 5532
3 2 Third decile 6495
4 3 Fourth decile 6707
5 4 Fifth decile 6593
6 5 Sixth decile 5832
7 6 Seventh decile 5160
8 7 Eighth decile 5042
9 8 Ninth decile 3933
10 9 Tenth decile 5890
11 10 <NA> 6974
12 NA <NA> 8800
Add income bracket information
Country-specific adjustments were applied for an efficient mapping. For example, due to the phrasing, some countries had overlapping values in the brackets: if the last bracket was “more than 4500” and 4500 was the same as the low point of the previous bracket.
# Load the income bracket information and apply country-specific adjustments.
income_recoded <- base::readRDS("111_income_recoded.rds") |>
dplyr::mutate(
income_lowpoint =
dplyr::case_when(
# Correct Mongolia's income bracket error. Where it reads
# "₮1,700,001 – ₮2,000,00" should be "₮1,700,001 – ₮2,000,000".
# Any reasonable person would be able to spot that,
# if they even noticed it.
UserLanguage %in% c("MN-MNG", "EN-MNG") & income_orig == 7 ~ 1700001,
# Qatar 5th bracket: AR-QAT: [150000-250000]; EN-QAT: [150001-250000]
UserLanguage == "AR-QAT" & income_orig == 5 ~ 150001,
# Correct the third bracket in Morocco because it reads "Around 2,500 dirhams
# per month" in the middle of the deciles.
UserLanguage == "AR-MAR" & income_orig == 2 ~ 1500,
TRUE ~ income_lowpoint
),
income_highpoint =
dplyr::case_when(
UserLanguage %in% c("MN-MNG", "EN-MNG") & income_orig == 7 ~ 2000000,
# Correct the third bracket in Morocco because it reads "Around 2,500 dirhams
# per month" in the middle of the deciles.
UserLanguage == "AR-MAR" & income_orig == 2 ~ 2500,
# Correct Uzbekistan's income brackets so the highpoint of each decile
# matches the lowpoint of the next decile (e.g., coding 14.9 mln as 14999999
UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 3 ~ 4999999,
UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 4 ~ 9999999,
UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 5 ~ 14999999,
UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 6 ~ 19999999,
UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 7 ~ 24999999,
UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 8 ~ 29999999,
# Georgia 2nd bracket: KA-GEO 0-500; EN-GEO 0-550.
UserLanguage == "KA-GEO" & income_orig == 1 ~ 550,
# Kyrgyzstan 8th bracket: KY-KGZ [100000-119000]; RU-KGZ [100000-119999].
UserLanguage == "KY-KGZ" & income_orig == 8 ~ 119999,
# Ar-TCD 3rd bracket overlaps with the 2nd bracket and do not match with
# FR-TCD's 3rd bracket.
UserLanguage == "AR-TCD" & income_orig == 3 ~ 3000000,
TRUE ~ income_highpoint
),
)
# Sanity check: Any country have the different brackets across languages?
# We expect to only have differences between Ireland's sponsored and main versions.
income_recoded |>
dplyr::group_by(country, income_orig) |>
dplyr::summarise(
n_lang = dplyr::n_distinct(UserLanguage),
n_brackets = dplyr::n_distinct(
paste(income_lowpoint, income_highpoint)
),
bracket_defs = paste0(
UserLanguage, ": [", income_lowpoint, "-", income_highpoint, "]",
collapse = "; "
),
.groups = "drop"
) |> dplyr::filter(n_lang > 1, n_brackets > 1)# A tibble: 9 × 5
country income_orig n_lang n_brackets bracket_defs
<chr> <int> <int> <int> <chr>
1 Ireland 1 2 2 EN-IRL: [0-17500]; EN-IRL-sponsored: [0-22000]
2 Ireland 2 2 2 EN-IRL: [17500-24999]; EN-IRL-sponsored: [22001-32000]
3 Ireland 3 2 2 EN-IRL: [25000-34999]; EN-IRL-sponsored: [32001-42000]
4 Ireland 4 2 2 EN-IRL: [35000-49999]; EN-IRL-sponsored: [42001-55000]
5 Ireland 5 2 2 EN-IRL: [50000-74999]; EN-IRL-sponsored: [55001-67000]
6 Ireland 6 2 2 EN-IRL: [75000-99999]; EN-IRL-sponsored: [67001-85000]
7 Ireland 7 2 2 EN-IRL: [100000-149999]; EN-IRL-sponsored: [85001-105000]
8 Ireland 8 2 2 EN-IRL: [150000-200000]; EN-IRL-sponsored: [105001-137000]
9 Ireland 9 2 2 EN-IRL: [200000-NA]; EN-IRL-sponsored: [137000-NA]
# Correct gaps between brackets
income_gaps <- income_recoded |>
dplyr::group_by(UserLanguage) |>
dplyr::arrange(income_orig, .by_group = TRUE) |>
# First check lowpoints
dplyr::mutate(
prev_high = dplyr::lag(income_highpoint),
expected_low = prev_high + 1,
has_gap = income_orig >= 2 &
income_orig <= 8 &
!is.na(prev_high) &
!is.na(income_lowpoint) &
income_lowpoint != expected_low,
income_lowpoint_adj = dplyr::if_else(
has_gap,
expected_low,
income_lowpoint
),
# Then highpoints
next_low = dplyr::lead(income_lowpoint_adj),
expected_high = next_low - 1L,
high_needs_fix =
income_orig >= 2 &
income_orig <= 8 &
!is.na(next_low) &
!is.na(income_highpoint) &
income_highpoint != expected_high,
income_highpoint_adj = dplyr::if_else(
high_needs_fix,
expected_high,
income_highpoint
)
) |>
dplyr::ungroup()
# Sanity check: View languages where there is a gap
income_gaps |>
dplyr::filter(has_gap) |>
dplyr::select(
UserLanguage,
income_orig,
prev_high,
income_lowpoint,
expected_low
) |>
print_reactable(sorted_col = "UserLanguage", width = 800)# Transform income_recoded into a wider format for merging
income_info <- income_gaps |>
dplyr::select(UserLanguage, income_orig,
income_lowpoint, income_lowpoint_adj,
income_highpoint, income_highpoint_adj) |>
tidyr::pivot_longer(
cols = c(income_lowpoint, income_lowpoint_adj,
income_highpoint, income_highpoint_adj),
names_to = "bound",
values_to = "value"
) |>
tidyr::pivot_wider(
names_from = c(bound, income_orig),
values_from = value,
names_sep = "_"
) |> dplyr::glimpse(width = 100)Rows: 125
Columns: 37
$ UserLanguage <chr> "AM-ARM", "AM-ETH", "AR-ARE", "AR-BHR", "AR-DZA", "AR-EGY", "AR-KWT…
$ income_lowpoint_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ income_lowpoint_adj_1 <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ income_highpoint_1 <dbl> 24000, 600, 60000, 300, 15000, 70000, 500, 18000000, 1500, 499, 500…
$ income_highpoint_adj_1 <dbl> 24000, 600, 60000, 300, 15000, 70000, 500, 18000000, 1500, 499, 500…
$ income_lowpoint_2 <dbl> 24001, 601, 60000, 301, 15000, 70001, 500, 18000000, 1500, 500, 500…
$ income_lowpoint_adj_2 <dbl> 24001, 601, 60001, 301, 15001, 70001, 501, 18000001, 1501, 500, 500…
$ income_highpoint_2 <dbl> 48000, 1200, 119999, 600, 24999, 200000, 999, 30000000, 2500, 999, …
$ income_highpoint_adj_2 <dbl> 48000, 1200, 119999, 600, 24999, 200000, 999, 30000000, 2500, 999, …
$ income_lowpoint_3 <dbl> 48001, 1201, 120000, 601, 25000, 200001, 1000, 30000000, 2500, 1000…
$ income_lowpoint_adj_3 <dbl> 48001, 1201, 120000, 601, 25000, 200001, 1000, 30000001, 2501, 1000…
$ income_highpoint_3 <dbl> 120000, 1800, 179999, 900, 34999, 400000, 1499, 60000000, 4000, 149…
$ income_highpoint_adj_3 <dbl> 120000, 1800, 179999, 900, 34999, 400000, 1499, 60000000, 4000, 149…
$ income_lowpoint_4 <dbl> 120001, 1801, 180000, 901, 35000, 400001, 1500, 60000000, 4000, 150…
$ income_lowpoint_adj_4 <dbl> 120001, 1801, 180000, 901, 35000, 400001, 1500, 60000001, 4001, 150…
$ income_highpoint_4 <dbl> 192000, 2400, 239999, 1200, 49999, 600000, 1999, 90000000, 6000, 19…
$ income_highpoint_adj_4 <dbl> 192000, 2400, 239999, 1200, 49999, 600000, 1999, 90000000, 6000, 19…
$ income_lowpoint_5 <dbl> 192000, 2401, 240000, 1201, 50000, 600001, 2000, 90000000, 6000, 20…
$ income_lowpoint_adj_5 <dbl> 192001, 2401, 240000, 1201, 50000, 600001, 2000, 90000001, 6001, 20…
$ income_highpoint_5 <dbl> 383000, 3000, 319999, 1500, 74999, 800000, 2999, 120000000, 8000, 2…
$ income_highpoint_adj_5 <dbl> 383000, 3000, 319999, 1500, 74999, 800000, 2999, 120000000, 8000, 2…
$ income_lowpoint_6 <dbl> 383001, 3001, 320000, 1501, 75000, 800001, 3000, 120000000, 8000, 2…
$ income_lowpoint_adj_6 <dbl> 383001, 3001, 320000, 1501, 75000, 800001, 3000, 120000001, 8001, 2…
$ income_highpoint_6 <dbl> 575000, 5000, 399999, 1800, 99999, 1200000, 3999, 150000000, 10000,…
$ income_highpoint_adj_6 <dbl> 575000, 5000, 399999, 1800, 99999, 1200000, 3999, 150000000, 10000,…
$ income_lowpoint_7 <dbl> 575001, 5001, 400000, 1801, 100000, 1200001, 4000, 150000000, 10000…
$ income_lowpoint_adj_7 <dbl> 575001, 5001, 400000, 1801, 100000, 1200001, 4000, 150000001, 10001…
$ income_highpoint_7 <dbl> 960000, 10000, 499999, 2000, 149999, 2400000, 4999, 200000000, 1250…
$ income_highpoint_adj_7 <dbl> 960000, 10000, 499999, 2000, 149999, 2400000, 4999, 200000000, 1250…
$ income_lowpoint_8 <dbl> 960000, 10001, 500000, 2001, 150000, 2400001, 5000, 200000000, 1250…
$ income_lowpoint_adj_8 <dbl> 960001, 10001, 500000, 2001, 150000, 2400001, 5000, 200000001, 1250…
$ income_highpoint_8 <dbl> 1200000, 20000, 699999, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_highpoint_adj_8 <dbl> 1200000, 19999, 699999, 2299, 199999, 4799999, 5999, 299999999, 149…
$ income_lowpoint_9 <dbl> 1200001, 20000, 700000, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_lowpoint_adj_9 <dbl> 1200001, 20000, 700000, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_highpoint_9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ income_highpoint_adj_9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
[1] 69408
df_merged <- df_merged |>
dplyr::left_join(income_info, by = "UserLanguage") |>
dplyr::relocate(income_lowpoint_1:income_highpoint_adj_9,
.after = income_orig_cat_10
)
# Sanity check
nrow(df_merged)[1] 69408
Identify strange numbers in income, assets, and debts
# Create function to identify strange numbers.
weird_nr <- function(i) {
# Temporary transform into a character vector so we can use stringr functions
s <- as.character(i)
# Flag numbers with the same non-zero digit repeated >=4 (e.g., 1111, 9999)
# except for zeros.
rep4 <-
stringr::str_detect(s, "(?:1111|2222|3333|4444|5555|6666|7777|8888|9999)")
# Flag sequential numbers of length >= 3 ascending or descending
# (e.g., 123, 1234, 4321)
asc3 <- stringr::str_detect(s, "(?:123|234|345|456|567|678|789)")
desc3 <- stringr::str_detect(s, "(?:321|432|543|654|765|876|987)")
# Flag repeated 2-digit blocks (e.g., 3939, 1212, 4545)
repeat2 <- stringr::str_detect(s, "(?!0{2})(\\d{2})\\1+")
# Combine all flags and check if any is TRUE
outcome <- (rep4 | asc3 | desc3 | repeat2)
# Make NAs as not weird
outcome[is.na(outcome)] <- FALSE
outcome
}
# Sanity check:
weird_nr(c(999999, 12340, 43210, 3939, 540000, 75000, NA))[1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# Apply function to financial variables
df_merged <- df_merged |>
dplyr::mutate(
income_wrd = weird_nr(income_text_clean) |
# Also detect rows where original text exists but cleaning is NA
(!is.na(income_text_orig) & is.na(income_text_clean)),
assets_wrd = weird_nr(assets_clean) |
(!is.na(assets_orig) & is.na(assets_clean)),
debts_wrd = weird_nr(debts_clean) |
(!is.na(debts_orig) & is.na(debts_clean))
) |>
relocate(income_wrd, .after = income_text_clean) |>
relocate(assets_wrd, .after = assets_clean) |>
relocate(debts_wrd, .after = debts_clean)
# Sanity check: View the counts of weird numbers per variable
base::table(df_merged$income_wrd, useNA = "always")
FALSE TRUE <NA>
69353 55 0
FALSE TRUE <NA>
68386 1022 0
FALSE TRUE <NA>
68411 997 0
# Sanity check: View changes between original and cleaned income text
df_merged |>
dplyr::mutate(
income_text_clean = as.character(income_text_clean),
n_digits_orig = stringr::str_count(income_text_orig, "[0-9]"),
n_digits_clean = stringr::str_count(income_text_clean, "[0-9]")
) |>
dplyr::filter(
income_text_clean != income_text_orig |
(!is.na(income_text_orig) & is.na(income_text_clean))) |>
dplyr::group_by(
income_text_orig,
income_text_clean,
income_wrd,
n_digits_orig,
n_digits_clean
) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "income_text_orig", width = 800)# Sanity check: View changes between original and cleaned assets text
df_merged |>
dplyr::mutate(
assets_clean = as.character(assets_clean),
debts_clean = as.character(debts_clean),
n_digits_orig = stringr::str_count(assets_orig, "[0-9]"),
n_digits_clean = stringr::str_count(assets_clean, "[0-9]")
) |>
dplyr::filter(
assets_clean != assets_orig | (!is.na(assets_orig) & is.na(assets_clean))) |>
dplyr::group_by(
assets_orig,
assets_clean,
assets_wrd,
n_digits_orig,
n_digits_clean
) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "assets_orig", width = 800)# Sanity check: View changes between original and cleaned debts text
df_merged |>
dplyr::mutate(
debts_clean = as.character(debts_clean),
n_digits_orig = stringr::str_count(debts_orig, "[0-9]"),
n_digits_clean = stringr::str_count(debts_clean, "[0-9]")
) |>
dplyr::filter(
debts_clean != debts_orig | (!is.na(debts_orig) & is.na(debts_clean)
)) |>
dplyr::group_by(
debts_orig,
debts_clean,
debts_wrd,
n_digits_orig,
n_digits_clean
) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "debts_orig", width = 500)# Sanity check: Count missing values in cleaned variables
dplyr::summarise(df_merged,
n_income_orig_text = sum(!is.na(income_text_orig)),
n_income_text_clean_na = sum(is.na(income_text_clean) & !is.na(income_text_orig)),
n_assets_orig = sum(!is.na(assets_orig)),
n_assets_clean_na = sum(is.na(assets_clean) & !is.na(assets_orig)),
n_debts_orig = sum(!is.na(debts_orig)),
n_debts_clean_na = sum(is.na(debts_clean) & !is.na(debts_orig))
)# A tibble: 1 × 6
n_income_orig_text n_income_text_clean_na n_assets_orig n_assets_clean_na n_debts_orig n_debts_clean_na
<int> <int> <int> <int> <int> <int>
1 6973 0 56550 861 56027 909
# Sanity check: View the rows with NA in cleaned values
# but original text exists
df_merged |> dplyr::group_by(income_text_orig, income_text_clean, income_wrd) |>
dplyr::filter(!is.na(income_text_orig) & is.na(income_text_clean)) |>
dplyr::summarise(n = dplyr::n()) |> base::nrow()[1] 0
df_merged |> dplyr::group_by(assets_orig, assets_clean, assets_wrd) |>
dplyr::filter(!is.na(assets_orig) & is.na(assets_clean)) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "assets_orig", width = 500)df_merged |> dplyr::group_by(debts_orig, debts_clean, debts_wrd) |>
dplyr::filter(!is.na(debts_orig) & is.na(debts_clean)) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "debts_orig", width = 500)Household Size
$household_size
How many people in your household are covered by these finances? Put 1 if you live alone, or if you live with others (e.g., roommates) but are financially independent from them and vice-versa, put 1. Otherwise, list the total number of people living with you that are part of household finances (both incomes and expenses).
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 <NA>
15591 15309 10311 9886 4984 2109 948 596 243 312 71 73 31 29 48 21 8 16 17 115 8690
Class: numeric
df_merged <- df_merged |>
dplyr::mutate(
household_size_group = factor(
dplyr::case_when(
household_size == 1 ~ "1",
household_size == 2 ~ "2",
household_size == 3 ~ "3",
household_size %in% c(4, 5) ~ "4-5",
household_size >= 6 ~ "6-20",
TRUE ~ NA_character_
),
levels = c("1", "2", "3", "4-5", "6-20"),
ordered = TRUE
)
) |>
dplyr::relocate(household_size_group, .after = household_size)
# Sanity check: View the mapping distribution of the new household size variable
base::table(df_merged$household_size, df_merged$household_size_group, useNA = "always")
1 2 3 4-5 6-20 <NA>
1 15591 0 0 0 0 0
2 0 15309 0 0 0 0
3 0 0 10311 0 0 0
4 0 0 0 9886 0 0
5 0 0 0 4984 0 0
6 0 0 0 0 2109 0
7 0 0 0 0 948 0
8 0 0 0 0 596 0
9 0 0 0 0 243 0
10 0 0 0 0 312 0
11 0 0 0 0 71 0
12 0 0 0 0 73 0
13 0 0 0 0 31 0
14 0 0 0 0 29 0
15 0 0 0 0 48 0
16 0 0 0 0 21 0
17 0 0 0 0 8 0
18 0 0 0 0 16 0
19 0 0 0 0 17 0
20 0 0 0 0 115 0
<NA> 0 0 0 0 0 8690
Birth Year and Age
Participants were able to write in an open text field their birth year, and the validation required values between 1925 and 2007, except for Iran (FA-IRN), where the validation ranged from 1304 to 1386.
[1] "character"
[1] "numeric"
# Sanity check: View values with non-numeric characters
df_merged |>
dplyr::filter(!is.na(birth_year_orig) & grepl("\\D", birth_year_orig)) |>
dplyr::select(birth_year_orig) |>
dplyr::distinct() |>
base::print(n = Inf)# A tibble: 41 × 1
birth_year_orig
<chr>
1 2001.
2 1993.
3 2000.
4 2003.
5 2002.
6 1995.
7 2005.
8 1997.
9 1969.
10 1977.
11 2005,
12 1985.
13 1982.
14 1983.
15 2004.
16 1996.
17 1980.
18 1994.
19 2007.
20 1992.
21 2006.
22 1975.
23 1972.
24 1978.
25 1999.
26 1990.
27 1989.
28 1945.
29 1981.
30 1971.
31 1974.
32 1955.
33 1949.
34 1959.
35 1988.
36 ,1979
37 1982.0424
38 ,1953
39 1973.01
40 1963.
41 1951.
# Create cleaned column and keep original.
# Calculate age.
df_merged <- df_merged |>
dplyr::mutate(
# extract first 4-digit sequence and transform to numerical
birth_year_clean =
as.numeric(stringr::str_extract(birth_year_orig, "\\d{4}")),
age = dplyr::case_when(
# Keep the values of the participants from the Irish sponsored dataset
!is.na(age) ~ age,
# If rows in birth year contains NA, then keep NA
is.na(birth_year_clean) & is.na(age) ~ NA_real_,
# If Q_Language is "FA-IRN",
# then use the Solar Hijri calendar (1404)
UserLanguage == "FA-IRN" & !is.na(birth_year_clean) ~ 1404 - birth_year_clean,
# Otherwise, use the Gregorian calendar (2025)
!is.na(birth_year_clean) ~ 2025 - birth_year_clean,
TRUE ~ NA_real_
),
# Create age groups
age_group = base::factor(dplyr::case_when(
age >= 18 & age <= 25 ~ "18-25",
age >= 26 & age <= 44 ~ "26-44",
age >= 45 & age <= 64 ~ "45-64",
age >= 65 & age <= 74 ~ "65-74",
age >= 75 ~ "75+",
TRUE ~ NA_character_
),
levels = c(
"18-25",
"26-44",
"45-64",
"65-74",
"75+"
))
) |>
dplyr::relocate(birth_year_clean:age_group, .after = birth_year_orig)
# Sanity check: View the summary of the cleaned birth year
cat(
"Min: ",
min(df_merged$birth_year_clean, na.rm = TRUE),
"\nMax: ",
max(df_merged$birth_year_clean, na.rm = TRUE),
"\nNA count: ",
sum(is.na(df_merged$birth_year_clean)),
"\nClass: ",
class(df_merged$birth_year_clean)
)Min: 1328
Max: 2007
NA count: 10380
Class: numeric
# Sanity check: Are there rows where raw birth year exists but cleaning failed?
df_merged |>
dplyr::filter(!is.na(birth_year_orig) & is.na(birth_year_clean)) |>
base::nrow()[1] 0
# Sanity check: View the summary of the age variable
cat(
"Min: ",
min(df_merged$age, na.rm = TRUE),
"\nMax: ",
max(df_merged$age, na.rm = TRUE),
"\nNA count: ",
sum(is.na(df_merged$age)),
"\nClass: ",
class(df_merged$age)
)Min: 18
Max: 100
NA count: 9180
Class: numeric
# Sanity check: View the mapping distribution of the new age group variable
base::table(df_merged$age_group, useNA = "ifany")
18-25 26-44 45-64 65-74 75+ <NA>
13674 30944 13098 1988 524 9180
# Sanity check: Are there rows where raw value exists but age group is missing?
df_merged |>
dplyr::filter(!is.na(birth_year_orig) & is.na(age_group)) |>
base::nrow()[1] 0
# Sanity check: View the new birth year and age variables
dplyr::glimpse(df_merged |>
dplyr::select(birth_year_orig,
birth_year_clean,
age,
age_group),
width = 100)Rows: 69,408
Columns: 4
$ birth_year_orig <chr> "1989", "1984", "1971", "1986", "1993", "2005", "1986", "1975", "1995", "…
$ birth_year_clean <dbl> 1989, 1984, 1971, 1986, 1993, 2005, 1986, 1975, 1995, 1963, 1993, 1981, 2…
$ age <dbl> 36, 41, 54, 39, 32, 20, 39, 50, 30, 62, 32, 44, 24, 35, 31, 24, 31, 39, 5…
$ age_group <fct> 26-44, 26-44, 45-64, 26-44, 26-44, 18-25, 26-44, 45-64, 26-44, 45-64, 26-…
# Sanity check: View counts of the sponsored Irish dataset
df_merged |>
dplyr::filter(irl==1) |>
dplyr::group_by(UserLanguage, birth_year_orig, birth_year_clean, age, age_group) |>
dplyr::summarise(n = dplyr::n())# A tibble: 67 × 6
# Groups: UserLanguage, birth_year_orig, birth_year_clean, age [67]
UserLanguage birth_year_orig birth_year_clean age age_group n
<chr> <chr> <dbl> <dbl> <fct> <int>
1 EN-IRL-sponsored <NA> NA 18 18-25 7
2 EN-IRL-sponsored <NA> NA 19 18-25 4
3 EN-IRL-sponsored <NA> NA 20 18-25 12
4 EN-IRL-sponsored <NA> NA 21 18-25 9
5 EN-IRL-sponsored <NA> NA 22 18-25 5
6 EN-IRL-sponsored <NA> NA 23 18-25 9
7 EN-IRL-sponsored <NA> NA 24 18-25 10
8 EN-IRL-sponsored <NA> NA 25 18-25 17
9 EN-IRL-sponsored <NA> NA 26 26-44 11
10 EN-IRL-sponsored <NA> NA 27 26-44 16
# ℹ 57 more rows
# Sanity check: View counts of Iran dataset
df_merged |>
dplyr::filter(UserLanguage == "FA-IRN") |>
dplyr::group_by(UserLanguage, birth_year_orig, birth_year_clean, age, age_group) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::arrange(-birth_year_clean) |>
print_reactable(sorted_col = "birth_year_clean", width = 800)# Sanity check: View counts of main dataset
df_merged |>
dplyr::filter(irl == 0 & UserLanguage != "FA-IRN") |>
dplyr::group_by(birth_year_orig, birth_year_clean, age, age_group) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::arrange(-birth_year_clean) |>
print_reactable(sorted_col = "birth_year_clean", width = 800)Sex
Upon collaborators’ request, the option “I prefer to use: [open text field]” was hidden from the survey versions in Kuwait (AR-KWT; EN-KWT), Egypt (AR-EGY; EN-EGY), Yemen (AR-YEM; EN-YEM), in Algeria (AR-DZA), in Saudi Arabia (AR-SAU), Chad (AR-TCD; FR-TCD), and Bahrain (AR-BHR; EN-BHR).
$sex_orig
Which best describes you? - Selected Choice
1 2 3 <NA>
23444 36194 549 9221
Class: numeric
# Load recoded values regarding sex because
# some participants wrote "Female" or "Male" in the open text field
sex_recoded <-
readr::read_csv("111_sex_open_answers_recoded.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 544
Columns: 2
$ ResponseId <chr> "R_42tedcZhWdJn9Sk", "R_9hnp095IY8LIkSX", "R_2gvEljyDLXs1Yjm", "R_516n1yU…
$ sex_text_recoded <chr> "Cannot determine", "Cannot determine", "Cannot determine", "Cannot deter…
Cannot determine Female Male Non-binary Other <NA>
137 12 17 344 34 0
df_merged <- df_merged |>
dplyr::left_join(sex_recoded, by = "ResponseId") |>
# create a reviewed numeric coding (1 = Male, 2 = Female, 3 = Other)
dplyr::mutate(
sex_reviewed = dplyr::case_when(
sex_text_recoded == "Female" ~ 2,
sex_text_recoded == "Male" ~ 1,
sex_text_recoded %in% c("Other", "Non-binary") ~ 3,
sex_text_recoded == "Cannot determine" ~ NA_real_,
TRUE ~ sex_orig
),
# categorical factor with explicit levels
sex_reviewed_cat = factor(
dplyr::case_when(
sex_reviewed == 1 ~ "Male",
sex_reviewed == 2 ~ "Female",
sex_reviewed == 3 ~ "Other",
TRUE ~ NA_character_
),
levels = c("Male", "Female", "Other")
),
# binary numeric: 1 = Male, 0 = Female, NA otherwise
sex_binary = dplyr::case_when(
sex_reviewed == 1 ~ 1,
sex_reviewed == 2 ~ 0,
TRUE ~ NA_real_
),
# binary factor
sex_binary_cat = factor(
dplyr::case_when(
sex_binary == 1 ~ "Male",
sex_binary == 0 ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")
)
) |>
dplyr::relocate(sex_text_recoded:sex_binary_cat, .after = sex_orig)
# Sanity check: Cross-tabs to inspect recoded text vs numeric reviewed code
df_merged |>
dplyr::group_by(sex_reviewed, sex_reviewed_cat, sex_binary, sex_binary_cat) |>
dplyr::summarise(n = dplyr::n(), .groups = "drop")# A tibble: 4 × 5
sex_reviewed sex_reviewed_cat sex_binary sex_binary_cat n
<dbl> <fct> <dbl> <fct> <int>
1 1 Male 1 Male 23461
2 2 Female 0 Female 36206
3 3 Other NA <NA> 383
4 NA <NA> NA <NA> 9358
# Sanity check: Cross-tabs to inspect original values vs numeric reviewed code
table(df_merged$sex_orig, df_merged$sex_reviewed, useNA = "always")
1 2 3 <NA>
1 23444 0 0 0
2 0 36194 0 0
3 17 12 383 137
<NA> 0 0 0 9221
# Sanity check: View the counts of each option
df_merged |>
dplyr::group_by(sex_reviewed, sex_reviewed_cat, sex_binary, sex_binary_cat) |>
dplyr::summarise(n = dplyr::n())# A tibble: 4 × 5
# Groups: sex_reviewed, sex_reviewed_cat, sex_binary [4]
sex_reviewed sex_reviewed_cat sex_binary sex_binary_cat n
<dbl> <fct> <dbl> <fct> <int>
1 1 Male 1 Male 23461
2 2 Female 0 Female 36206
3 3 Other NA <NA> 383
4 NA <NA> NA <NA> 9358
Education Level
The translated education categories of each country will be mapped to a common set of categories. Some countries had a different definition of secondary education, so the mapping will consider if the level is eligible for university entrance or not. The classification of the education levels in each country was agreed upon with the collaborators. The recoded education categories are:
- Less than secondary (not eligible for university entrance)
- Secondary (completed the equivalent to high school, and it is eligible for university entrance)
- Technical (not higher education)
- University (higher education up to a bachelor’s degree)
- Advanced (anything beyond a bachelor’s degree)
Note:
- The team from Ethiopia (AM-ETH and EN-ETH) requested to hide option 7 from their versions of the survey.
- The team from Peru requested to include an option for “Inclusive Education”. Since this applies across several levels, this option was recoded to NA.
$education_orig
Which is the highest level of education you have completed?
1 2 3 4 5 6 7 8 <NA>
328 1084 8515 8206 19982 13762 4459 3773 9299
Class: numeric
# Load the education categories for each country
edu_cat <-
readr::read_csv("111_education_recoded.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 1,024
Columns: 5
$ UserLanguage <chr> "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM"…
$ education_orig <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,…
$ education_cat <chr> "No primary education", "Primary (Elementary/Middle School)", "High …
$ education_recoded_cat <chr> "Less than secondary", "Less than secondary", "Secondary", "Technica…
$ education_recoded <dbl> 1, 1, 2, 3, 4, 4, 5, 5, 1, 1, 2, 3, 4, 5, NA, 5, 1, 1, 2, 3, 4, 5, 5…
# Sanity check: View if there are unexpected values in education_orig
base::table(edu_cat$education_orig, useNA = "always")
1 2 3 4 5 6 7 8 <NA>
128 128 128 128 128 128 128 128 0
# Sanity check: View if the categories match the expected values
# We expect three cells with missing values regarding
# Peru's inclusive education level, and level 7 was hidden for Ethiopia (AM-ETH
# and EN-ETH).
edu_cat |>
dplyr::group_by(education_recoded_cat, education_recoded) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::arrange(education_recoded)# A tibble: 6 × 3
# Groups: education_recoded_cat [6]
education_recoded_cat education_recoded n
<chr> <dbl> <int>
1 Less than secondary 1 275
2 Secondary 2 138
3 Technical 3 119
4 University 4 129
5 Advanced 5 360
6 <NA> NA 3
# A tibble: 3 × 5
UserLanguage education_orig education_cat education_recoded_cat education_recoded
<chr> <dbl> <chr> <chr> <dbl>
1 AM-ETH 7 <NA> <NA> NA
2 EN-ETH 7 <NA> <NA> NA
3 ES-PER 5 Inclusive education <NA> NA
# Add the education categories to the main data frame
df_merged <- df_merged |>
dplyr::left_join(
edu_cat |> dplyr::select(
UserLanguage,
education_orig,
education_cat,
education_recoded_cat,
education_recoded
),
by = c("UserLanguage", "education_orig")
) |>
dplyr::mutate(
education_recoded_cat = base::factor(
education_recoded_cat,
levels = c(
"Less than secondary",
"Secondary",
"Technical",
"University",
"Advanced"
),
ordered = TRUE
)
) |>
dplyr::relocate(education_cat:education_recoded, .after = education_orig)
# Sanity check: Are there education values without a corresponding
# education_recoded and education_recoded_cat?
df_merged |>
group_by(
UserLanguage,
education_orig,
education_cat,
education_recoded_cat,
education_recoded
) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::filter(is.na(education_recoded) & !is.na(education_orig))# A tibble: 1 × 6
# Groups: UserLanguage, education_orig, education_cat, education_recoded_cat [1]
UserLanguage education_orig education_cat education_recoded_cat education_recoded n
<chr> <dbl> <chr> <ord> <dbl> <int>
1 ES-PER 5 Inclusive education <NA> NA 31
# Sanity check: Check Irish sponsored dataset
df_merged |>
filter(irl == 1) |>
group_by(
education_irl,
education_orig,
education_cat,
education_recoded_cat,
education_recoded) |> dplyr::summarise(n = dplyr::n())# A tibble: 8 × 6
# Groups: education_irl, education_orig, education_cat, education_recoded_cat [8]
education_irl education_orig education_cat education_recoded_cat education_recoded n
<chr> <dbl> <chr> <ord> <dbl> <int>
1 Degree 6 Degree University 4 355
2 Diploma 5 Diploma Technical 3 185
3 Doctorate 8 Doctorate Advanced 5 11
4 Junior (Inter) Certificate or Equivalent 2 Junior (Inter) Certificate or Equivalent Less than secondary 1 67
5 Leaving Certificate 3 Leaving Certificate Secondary 2 277
6 Less than Junior (Inter) Cert 1 Less than Junior (Inter) Cert Less than secondary 1 13
7 Master's 7 Master's Advanced 5 154
8 Technical or Vocational Certificate 4 Technical or Vocational Certificate Technical 3 138
Employment Status
Upon collaborators’ request, the option “Part-time student” was hidden from the versions KA-GEO and EN-GEO in Georgia and SR-SRB in Serbia. The option “Military service” was hidden from the version JA-JPN in Japan.
During the survey completion, participants were not allowed to select conflicting options:
- Employed full-time and part-time simultaneously.
- Student full-time and part-time simultaneously.
- Employed/working full-time or part-time and not in paid employment simultaneously.
- Military service and not in paid employment simultaneously.
- Military service and retired simultaneously.
- Retired and not in paid employment simultaneously.
- Not in paid employment by choice and looking for work or unable to work due to health/personal reasons simultaneously.
- Looking for work and unable to work due to health/personal reasons simultaneously.
Employment status was recoded using a sequential rule:
Militaryif the military service option was selected.Employed/working full-time (25+ hours per week)if the full-time employment option was selected.Employed/working part-time (less than 25 hours per week)if the part-time employment option was selected.Not in paid employment (looking for work)if the job-seeking option was selected and no conditions above were met.Student non-working (Full or part-time)if the full- or part-time student was selected and no conditions above were met.Not in paid employment (by choice/health)if not working by choice or for health reasons and no conditions above were met.Retiredif the retired option was selected and no conditions above were met.
$employment_orig
Which most accurately describes you at this moment? You may select up to two options in case you fit more than one category.
1 1,3 1,4 1,5 1,6 1,7 1,8 1,9 2 2,3 2,4 2,5 2,6 2,7 2,8 2,9 3 3,5 3,6 4 4,5 4,6 5 6 7
6511 962 984 70 18 328 671 114 1312 1440 628 34 28 113 267 54 30950 217 189 4737 47 182 589 2592 1839
8 9 <NA>
3233 1728 9571
Class: character
# Replace numeric values with descriptive labels
employment_labels <- c(
"1" = "Full-time student",
"2" = "Part-time student",
"3" = "Employed/working full-time (25+ hours per week)",
"4" = "Employed/working part-time (less than 25 hours per week)",
"5" = "Military service",
"6" = "Retired",
"7" = "Not in paid employment (by choice)",
"8" = "Not in paid employment (looking for work)",
"9" = "Not in paid employment (unable to work due to health/personal reasons)")
# Function to recode multiple-choice values
recode_employment <- function(i) {
# If row is NA, return NA
if (is.na(i)) return(NA_character_)
# Split the string by comma and map to labels
codes <- strsplit(i, ",")[[1]]
# Collapse the labels into a single string
paste(employment_labels[trimws(codes)], collapse = "; ")
}
df_merged <- df_merged |>
dplyr::mutate(
# Apply recoding function to create employment_cat variable
# so instead of "2,5", we have "Part-time student; Military service"
employment_cat =
stringr::str_squish(sapply(employment_orig, recode_employment)),
employment_primary = base::factor(
dplyr::case_when(
# Contains option 5
stringr::str_detect(employment_orig, fixed("5"))
~ "Military service",
# Contains option 3 AND do not contain option 5
stringr::str_detect(employment_orig, fixed("3")) &
!(stringr::str_detect(employment_orig, fixed("5")))
~ "Employed/working full-time (25+ hours per week)",
# Contains option 4 AND do not contain option 5
# (it was not possible to select options 3 and 4 simultaneously)
stringr::str_detect(employment_orig, fixed("4")) &
!(stringr::str_detect(employment_orig, fixed("5")))
~ "Employed/working part-time (less than 25 hours per week)",
# Contains option 8 AND do not contain option 5
# (it was not possible to select options 8 and 5, 3 or 4 simultaneously)
stringr::str_detect(employment_orig, fixed("8"))
~ "Not in paid employment (looking for work)",
# Contains option 1 or 2 AND do not contain option 5, 3, 4, or 8
(stringr::str_detect(employment_orig, fixed("1")) |
stringr::str_detect(employment_orig, fixed("2"))) &
!(stringr::str_detect(employment_orig, fixed("5"))) &
!(stringr::str_detect(employment_orig, fixed("3"))) &
!(stringr::str_detect(employment_orig, fixed("4"))) &
!(stringr::str_detect(employment_orig, fixed("8")))
~ "Student non-working (Full or part-time)",
# Contains option 7 or 9 AND do not contain option 1, or 2
# (it was not possible to select options 7 or 9
# and 8, 5, 3 or 4 simultaneously)
(stringr::str_detect(employment_orig, fixed("7")) |
stringr::str_detect(employment_orig, fixed("9"))) &
!(stringr::str_detect(employment_orig, fixed("1"))) &
!(stringr::str_detect(employment_orig, fixed("2")))
~ "Not in paid employment (by choice/health)",
# Contains option 6 AND do not contain option 5, 3, 4, 8, 1, 2, 7 or 9
# (it was not possible to select options 6 and 7, 8, 9, 5 simultaneously)
stringr::str_detect(employment_orig, fixed("6")) &
!(stringr::str_detect(employment_orig, fixed("3"))) &
!(stringr::str_detect(employment_orig, fixed("4"))) &
!(stringr::str_detect(employment_orig, fixed("1"))) &
!(stringr::str_detect(employment_orig, fixed("2")))
~ "Retired",
TRUE ~ NA_character_
),
levels = c(
"Not in paid employment (by choice/health)",
"Not in paid employment (looking for work)",
"Student non-working (Full or part-time)",
"Employed/working full-time (25+ hours per week)",
"Employed/working part-time (less than 25 hours per week)",
"Retired",
"Military service"
)
)
) |>
dplyr::relocate(employment_cat:employment_primary, .after = employment_orig)
# Sanity check: How many options were selected per participant?
df_merged |>
dplyr::mutate(number_of_options_selected =
if_else(is.na(employment_orig),
NA_integer_,
str_count(employment_orig, ",") + 1)) |>
count(number_of_options_selected)# A tibble: 3 × 2
number_of_options_selected n
<dbl> <int>
1 1 53491
2 2 6346
3 NA 9571
# Sanity check: View the distribution of primary employment
base::table(df_merged$employment_primary, useNA = "ifany")
Not in paid employment (by choice/health) Not in paid employment (looking for work)
3567 4171
Student non-working (Full or part-time) Employed/working full-time (25+ hours per week)
8478 33541
Employed/working part-time (less than 25 hours per week) Retired
6531 2592
Military service <NA>
957 9571
# Sanity check: Cross-tab between primary employment and original employment
print(table(df_merged$employment_primary,
df_merged$employment_orig, useNA = "ifany"), n = Inf)
1 1,3 1,4 1,5 1,6 1,7 1,8 1,9 2 2,3 2,4 2,5 2,6 2,7 2,8
Not in paid employment (by choice/health) 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Not in paid employment (looking for work) 0 0 0 0 0 0 671 0 0 0 0 0 0 0 267
Student non-working (Full or part-time) 6511 0 0 0 18 328 0 114 1312 0 0 0 28 113 0
Employed/working full-time (25+ hours per week) 0 962 0 0 0 0 0 0 0 1440 0 0 0 0 0
Employed/working part-time (less than 25 hours per week) 0 0 984 0 0 0 0 0 0 0 628 0 0 0 0
Retired 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Military service 0 0 0 70 0 0 0 0 0 0 0 34 0 0 0
<NA> 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2,9 3 3,5 3,6 4 4,5 4,6 5 6 7 8 9 <NA>
Not in paid employment (by choice/health) 0 0 0 0 0 0 0 0 0 1839 0 1728 0
Not in paid employment (looking for work) 0 0 0 0 0 0 0 0 0 0 3233 0 0
Student non-working (Full or part-time) 54 0 0 0 0 0 0 0 0 0 0 0 0
Employed/working full-time (25+ hours per week) 0 30950 0 189 0 0 0 0 0 0 0 0 0
Employed/working part-time (less than 25 hours per week) 0 0 0 0 4737 0 182 0 0 0 0 0 0
Retired 0 0 0 0 0 0 0 0 2592 0 0 0 0
Military service 0 0 217 0 0 47 0 589 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0 0 0 0 0 9571
# Sanity check: View the counts of each option
df_merged |>
dplyr::group_by(employment_orig, employment_cat, employment_primary) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "employment_orig", width = 800)Citizenship and Ethnicity
This item allowed participants to select multiple choices. The first eight options referred to ethnicity. Only some countries contained the options related to ethnicity.
The last three options referred to citizenship status. Participants were not allowed to select Citizen of [country] and Resident of [country] (non-citizen) simultaneously. All countries contained the citizenship options.
Citizenship
$ethnicity_citizenship_orig
Please choose which best describes you. You must select at least one option from the top part and at least one option from the bottom. - Selected Choice
1,10 1,11 1,2,10 1,2,11 1,2,3,10 1,2,3,4,10 1,2,3,4,5,10
24538 444 321 6 11 4 1
1,2,3,4,5,6,10 1,2,3,4,5,6,7,8,11 1,2,3,4,6,7,10 1,2,3,4,7,10 1,2,3,4,7,8,9,10 1,2,3,5,10 1,2,3,5,8,10
1 1 1 1 1 2 1
1,2,3,5,9,10 1,2,3,8,10 1,2,3,9,10 1,2,4,10 1,2,4,11 1,2,4,5,10 1,2,4,6,7,10
1 2 1 115 3 10 1
1,2,4,6,8,10 1,2,4,8,10 1,2,4,8,11 1,2,4,9,10 1,2,5,10 1,2,6,10 1,2,6,11
1 11 1 5 5 2 1
1,2,6,8,10 1,2,7,10 1,2,7,8,10 1,2,8,10 1,2,8,11 1,2,9 1,2,9,10
1 1 1 19 2 14 16
1,2,9,11 1,3,10 1,3,11 1,3,4,10 1,3,4,5,10 1,3,4,6,10 1,3,5,10
1 164 2 8 1 1 3
1,3,5,9 1,3,6,10 1,3,8,10 1,3,8,11 1,3,8,9,10 1,3,8,9,11 1,3,9
1 1 11 1 2 1 2
1,3,9,10 1,4,10 1,4,11 1,4,5,10 1,4,5,11 1,4,5,8,10 1,4,5,9
5 134 5 14 1 3 1
1,4,5,9,10 1,4,6,10 1,4,6,8,10 1,4,7,10 1,4,7,8,10 1,4,8,10 1,4,8,9,10
1 1 1 3 1 9 2
1,4,9 1,4,9,10 1,4,9,11 1,5,10 1,5,11 1,5,6,8,10 1,5,6,9,10
3 8 1 104 4 1 1
1,5,7,10 1,5,8,10 1,5,9 1,5,9,10 1,5,9,11 1,6,10 1,6,11
2 10 4 3 2 72 1
1,6,7,10 1,6,8,10 1,6,8,9 1,6,9,10 1,6,9,11 1,7,10 1,7,11
1 5 1 3 1 46 1
1,7,8,10 1,7,8,9,10 1,7,9 1,7,9,10 1,7,9,11 1,8,10 1,8,11
1 2 2 2 1 584 30
1,8,9 1,8,9,10 1,8,9,11 1,9 1,9,10 1,9,11 10
28 17 2 664 368 95 10450
11 2,10 2,11 2,3,10 2,3,11 2,3,4,5,6,9,10 2,3,4,5,7,10
415 2307 222 14 3 1 1
2,3,4,5,9 2,3,4,8,11 2,3,4,9,11 2,3,5,11 2,3,5,9,11 2,3,6,8,10 2,3,7,10
1 1 1 3 1 1 2
2,3,8,10 2,3,8,9,11 2,3,9 2,3,9,11 2,4,10 2,4,11 2,4,5,6,10
2 1 3 2 80 8 1
2,4,5,6,7,10 2,4,5,7,10 2,4,6,10 2,4,7,10 2,4,8,10 2,4,9 2,4,9,10
1 1 2 1 2 2 2
2,5,10 2,5,11 2,5,7,10 2,5,9,11 2,6,10 2,6,7,10 2,6,8,10
12 3 1 4 10 1 1
2,6,9 2,6,9,11 2,7,10 2,7,8,9,10 2,7,9 2,7,9,10 2,8,10
2 1 40 1 2 6 83
2,8,11 2,8,9 2,8,9,10 2,8,9,11 2,9 2,9,10 2,9,11
4 32 3 5 306 46 60
3,10 3,11 3,4,10 3,4,5,7,10 3,4,6,11 3,4,7,10 3,4,7,11
1406 64 18 1 1 3 1
3,4,8,10 3,4,8,9 3,4,9 3,4,9,10 3,4,9,11 3,5,10 3,5,6,9,10
2 1 2 2 1 10 1
3,5,7,10 3,5,7,9 3,5,9 3,6,10 3,7,10 3,7,9 3,7,9,10
1 1 1 11 22 1 2
3,8,10 3,8,11 3,8,9 3,8,9,10 3,8,9,11 3,9 3,9,10
53 6 5 4 1 127 36
3,9,11 4,10 4,11 4,5,10 4,5,11 4,5,7,10 4,5,8,10
11 2701 97 30 1 7 3
4,5,9 4,6,10 4,6,11 4,6,7,10 4,6,8,10 4,6,9 4,7,10
2 25 2 1 2 1 135
4,7,11 4,7,8,10 4,7,8,9 4,7,9 4,7,9,10 4,7,9,11 4,8,10
2 2 1 6 9 2 100
4,8,11 4,8,9 4,8,9,10 4,9 4,9,10 4,9,11 5,10
4 7 2 179 56 28 2617
5,11 5,6,10 5,6,9 5,7,10 5,7,8,10 5,8,10 5,8,11
45 2 1 46 4 48 2
5,8,9 5,8,9,10 5,8,9,11 5,9 5,9,10 5,9,11 6,10
5 4 1 90 34 20 1233
6,11 6,7,10 6,7,9 6,7,9,10 6,8,10 6,8,11 6,8,9
18 10 1 1 40 1 1
6,8,9,11 6,9 6,9,10 6,9,11 7,10 7,11 7,8,10
1 33 14 14 4584 42 34
7,8,9,11 7,9 7,9,10 7,9,11 8,10 8,11 8,9
1 137 108 34 1715 101 98
8,9,10 8,9,11 9 9,10 9,11 <NA>
47 38 401 338 269 9825
Class: character
# Create variables for each citizenship option (9 to 11)
df_merged <- df_merged |>
dplyr::mutate(
citizenship_cat = base::factor(
dplyr::case_when(
# When option 10 = "Citizen of [country]" was selected
# and option 9 = "Born outside [country]" was not selected.
# It was not possible to select options 10 and 11 simultaneously.
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10")) &
!(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")))
~ "Citizen",
# When option 11 = "Resident of [country] (non-citizen)" was selected
# and option 9 = "Born outside [country]" was not selected.
# It was not possible to select options 10 and 11 simultaneously.
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11")) &
!(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")))
~ "Non-citizen (Permanent Resident)",
# When option 9 = "Born outside [country]" was selected
# and option 10 = "Citizen of [country]" was selected.
# It was not possible to select options 10 and 11 simultaneously.
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10"))
~ "Born outside country (Citizen)",
# When option 9 = "Born outside [country]" was selected
# and option 11 = "Resident of [country] (non-citizen)" was selected.
# It was not possible to select options 10 and 11 simultaneously.
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11"))
~ "Born outside country (Non-citizen, Permanent Resident)",
# When only option 9 = "Born outside [country]" was selected.
stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
!(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11"))) &
!(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10")))
~ "Born outside country (Non-citizen, Non-permanent Resident)",
TRUE ~ NA_character_
),
levels = c(
"Citizen",
"Non-citizen (Permanent Resident)",
"Born outside country (Citizen)",
"Born outside country (Non-citizen, Permanent Resident)",
"Born outside country (Non-citizen, Non-permanent Resident)")
)
) |>
dplyr::relocate(citizenship_cat, .after = ethnicity_citizenship_orig)
# Sanity check: View the distribution of citizenship categories
df_merged |>
dplyr::mutate(
# Extract only the citizenship options selected
citizenship_extract = stringr::str_extract_all(
ethnicity_citizenship_orig, "(?<=^|,)(9|10|11)(?=,|$)") |>
purrr::map_chr(\(i) {
if (length(i) == 0) return(NA_character_)
if (all(is.na(i))) return(NA_character_)
paste(i[!is.na(i)], collapse = ",")
})
) |>
dplyr::group_by(citizenship_extract, citizenship_cat) |>
dplyr::summarise(n = dplyr::n())# A tibble: 6 × 3
# Groups: citizenship_extract [6]
citizenship_extract citizenship_cat n
<chr> <fct> <int>
1 10 Citizen 54110
2 11 Non-citizen (Permanent Resident) 1549
3 9 Born outside country (Non-citizen, Non-permanent Resident) 2169
4 9,10 Born outside country (Citizen) 1155
5 9,11 Born outside country (Non-citizen, Permanent Resident) 600
6 <NA> <NA> 9825
Ethnicity
# Upload the ethnicity categories translated that were used for each country
ethnicity_cat <-
readr::read_csv("111_ethnicity_labels_translated.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 533
Columns: 3
$ UserLanguage <chr> "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AR-ARE", "AR-ARE", "AR-AR…
$ option_number <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 1, 2, 3, 4, 1, 2, 3, 1, 2, 3, 4, 5, 6, 1…
$ label <chr> "Armenians", "Ezidis", "Russians", "Assyrians", "Ukrainians", "Arab/Middle E…
# Extract the ethnicity options
df_merged <- df_merged |>
dplyr::mutate(
# Extract the ethnicity options
# Don't extract option 8 = "Specify: [open text field]"
# because that will be added later
ethnicity_agg = stringr::str_extract_all(
ethnicity_citizenship_orig,
"(?<=^|,)(1|2|3|4|5|6|7)(?=,|$)"
) |>
purrr::map_chr(\(i) {
# Participants that did not complete this item should have NA.
# Participants that completed a survey version
# without ethnicity options should have NA.
if (length(i) == 0) return(NA_character_)
if (all(is.na(i))) return(NA_character_)
paste(i[!is.na(i)], collapse = ",")
})
)
# Replace ethnicity options with the translated labels
# Transformation will be conducted in a temporary data frame for safety
df_temp <- df_merged |>
# Remove missing values for this transformation
dplyr::filter(!is.na(ethnicity_agg)) |>
# Separate values into rows
# (if participant wrote "1,2", create two rows: one with "1" and another with "2")
tidyr::separate_rows(ethnicity_agg, sep = ",") |>
# Create variable that is going to match with ethnicity_cat
dplyr::mutate(option_number = as.numeric(stringr::str_trim(ethnicity_agg))) |>
# Join ethnicity_cat to get the translated labels
dplyr::left_join(ethnicity_cat, by = c("UserLanguage", "option_number")) |>
# Bring back to former format of having multiple options in a single row
# but now with the translated labels instead of numbers
dplyr::group_by(ResponseId) |>
dplyr::summarise(
ethnicity_translated = base::paste(label[!is.na(label)], collapse = ",")
)
# Join back to main data frame
nrow(df_merged)[1] 69408
[1] 69408
# Cleanup
rm(df_temp)
# Sanity check: Are the number of missing values in the new variable the same
# as in the original variable plus those that only selected citizenship options
# or only the please specify option (8)?
sum(is.na(df_merged$ethnicity_translated)) ==
(sum(is.na(df_merged$ethnicity_citizenship_orig)) + sum(
!is.na(df_merged$ethnicity_citizenship_orig) &
stringr::str_detect(
df_merged$ethnicity_citizenship_orig,
"^(?:\\s*(?:8|9|10|11)\\s*)(?:,\\s*(?:8|9|10|11)\\s*)*$"
)
))[1] TRUE
# Add the cleaned responses from the "Specify: [open text field]" option (8)
ethnicity_recoded <-
readr::read_csv("111_ethnicity_open_answers_recoded.csv", show_col_types = FALSE) |>
dplyr::glimpse(width = 100)Rows: 3,274
Columns: 2
$ ResponseId <chr> "R_8FrYunIVSiVeX5B", "R_8rYZOG6u8qXwprj", "R_8p9yE9TFIjGUonc", "R_2Lzosf…
$ ethnicity_specify <chr> "Cannot determine", "Cannot determine", "Cannot determine", "Cannot dete…
df_merged <- df_merged |>
dplyr::left_join(ethnicity_recoded, by = "ResponseId") |>
dplyr::relocate(ethnicity_agg:ethnicity_specify, .after = ethnicity_citizenship_orig)
# Sanity check
dplyr::glimpse(df_merged |>
dplyr::group_by(ethnicity_citizenship_orig, UserLanguage) |>
dplyr::distinct(ethnicity_citizenship_orig, UserLanguage,
ethnicity_agg, ethnicity_translated, ethnicity_specify,
.keep_all = TRUE) |>
dplyr::ungroup() |>
dplyr::select(UserLanguage, ethnicity_citizenship_orig,
ethnicity_agg, ethnicity_translated, ethnicity_specify),
width = 100)Rows: 2,567
Columns: 5
$ UserLanguage <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "FIL-PHL", "P…
$ ethnicity_citizenship_orig <chr> "3,6,10", "1,10", "5,10", "3,10", "1,10", "1,10", "1,9", "2,3,5…
$ ethnicity_agg <chr> "3,6", "1", "5", "3", "1", "1", "1", "2,3,5", "2", "2", "1,2", …
$ ethnicity_translated <chr> "Diola / Malinké,Haalpulaaren", "Wolof / Lébou", "White", "Blac…
$ ethnicity_specify <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Mo…
# View values
df_merged |>
dplyr::filter(!is.na(ethnicity_citizenship_orig)) |>
dplyr::mutate(
ethnicity_extract = stringr::str_extract_all(
ethnicity_citizenship_orig,
"(?<=^|,)(1|2|3|4|5|6|7|8)(?=,|$)"
) |>
purrr::map_chr(\(i) {
if (length(i) == 0) return(NA_character_)
if (all(is.na(i))) return(NA_character_)
paste(i[!is.na(i)], collapse = ",")
})
) |>
dplyr::group_by(country, ethnicity_extract, ethnicity_agg,
ethnicity_translated, ethnicity_specify) |>
dplyr::summarise(n = dplyr::n(), .groups = "drop") |>
dplyr::arrange(country) |>
print_reactable(sorted_col = "country", width = 800)Honeypot for Bots
An item was added to the survey, and via JavaScript code, the item was hidden from human participants. Bots that do not compute JavaScript code would process this item and provide an answer, which would allow us to identify them.
Optional Ending
This item was optional and can be NA. Also, this item was not shown to sponsored participants.
Childhood Socioeconomic Status
$childhood_SES
As a child, how would you describe the financial situation in your household compared to a typical home where you grew up?
1 2 3 4 5 <NA>
4564 10353 15104 8711 1219 29457
Class: numeric
# Create categorical variable with labels
df_merged <- df_merged |>
dplyr::mutate(
childhood_SES_cat = base::factor(
dplyr::case_when(
childhood_SES == 1 ~ "Poor",
childhood_SES == 2 ~ "Below average but not poor",
childhood_SES == 3 ~ "Around average",
childhood_SES == 4 ~ "Above average but not wealthy",
childhood_SES == 5 ~ "Wealthy",
TRUE ~ NA_character_
),
levels = c(
"Poor",
"Below average but not poor",
"Around average",
"Above average but not wealthy",
"Wealthy"
),
ordered = TRUE)) |>
dplyr::relocate(childhood_SES_cat, .after = childhood_SES)
# Sanity check: View the distribution of the new variable
base::table(df_merged$childhood_SES_cat, useNA = "always")
Poor Below average but not poor Around average Above average but not wealthy Wealthy
4564 10353 15104 8711 1219
<NA>
29457
Financial Outlook and Confidence
# Nothing to do here, the item is already numeric and
# within the minimum and maximum values.
table_label(df_merged$fin_outlook)$fin_outlook
What is your expectation for how things will be for you financially one year from now?
1 2 3 4 5 <NA>
1663 3952 13487 14794 5958 29554
Class: numeric
$fin_outlook_conf
On a scale from 1 (completely uncertain) to 10 (completely certain), how confident are you in your answer to the last question?
1 2 3 4 5 6 7 8 9 10 <NA>
995 527 1014 1732 4371 4260 6809 8042 4590 7514 29554
Class: numeric
df_merged <- df_merged |>
dplyr::mutate(
fin_outlook_cat = base::factor(
dplyr::case_when(
fin_outlook == 1 ~ "Things will be much worse",
fin_outlook == 2 ~ "Things will be somewhat worse",
fin_outlook == 3 ~ "Things will be about the same",
fin_outlook == 4 ~ "Things will be somewhat better",
fin_outlook == 5 ~ "Things will be much better",
TRUE ~ NA_character_
),
levels = c(
"Things will be much worse",
"Things will be somewhat worse",
"Things will be about the same",
"Things will be somewhat better",
"Things will be much better"
),
ordered = TRUE
)
) |>
dplyr::relocate(fin_outlook_cat, .after = fin_outlook)Attention and Care
$attention_care
There are people that care about and pay attention to what goes on in my life.
1 2 3 4 5 6 7 <NA>
836 838 1949 4431 13507 9044 9214 29589
Class: numeric
df_merged <- df_merged |>
dplyr::mutate(
attention_care_cat = base::factor(
dplyr::case_when(
attention_care == 1 ~ "Completely disagree",
attention_care == 2 ~ "Strongly disagree",
attention_care == 3 ~ "Disagree",
attention_care == 4 ~ "Neutral",
attention_care == 5 ~ "Agree",
attention_care == 6 ~ "Strongly agree",
attention_care == 7 ~ "Completely agree",
TRUE ~ NA_character_
),
levels = c(
"Completely disagree",
"Strongly disagree",
"Disagree",
"Neutral",
"Agree",
"Strongly agree",
"Completely agree"
),
ordered = TRUE)) |>
dplyr::relocate(attention_care_cat, .after = attention_care)
# Sanity check: View the distribution of the new variable
df_merged |>
dplyr::count(attention_care, attention_care_cat)# A tibble: 8 × 3
attention_care attention_care_cat n
<dbl> <ord> <int>
1 1 Completely disagree 836
2 2 Strongly disagree 838
3 3 Disagree 1949
4 4 Neutral 4431
5 5 Agree 13507
6 6 Strongly agree 9044
7 7 Completely agree 9214
8 NA <NA> 29589
Workplace Arragement
$work_arrangement
Which most accurately describes your current work (or study) arrangement?
1 2 3 4 5 <NA>
17717 6169 3558 3226 2890 35848
Class: numeric
# Create categorical variable with labels
df_merged <- df_merged %>%
dplyr::mutate(
work_arrangement_cat = base::factor(
dplyr::case_when(
work_arrangement == 1
~ "I work entirely in-person (i.e., in an office, on-site)",
work_arrangement == 2
~ "I mostly work in-person, with occasional remote days",
work_arrangement == 3
~ "I work about evenly in-person/remote",
work_arrangement == 4
~ "I mostly work remotely, with occasional in-person days",
work_arrangement == 5
~ "I work entirely remotely",
TRUE ~ NA_character_
),
levels = c(
"I work entirely in-person (i.e., in an office, on-site)",
"I mostly work in-person, with occasional remote days",
"I work about evenly in-person/remote",
"I mostly work remotely, with occasional in-person days",
"I work entirely remotely"
),
ordered = TRUE
),
work_arrangement_cat_nostudents = base::factor(
dplyr::if_else(
employment_primary == "Student non-working (Full or part-time)",
NA_character_,
as.character(work_arrangement_cat)
),
levels = levels(work_arrangement_cat),
ordered = TRUE
)
) |>
dplyr::relocate(work_arrangement_cat,
work_arrangement_cat_nostudents,
.after = work_arrangement)
# Sanity check: View the distribution of the new variable
df_merged |>
dplyr::group_by(work_arrangement, work_arrangement_cat) |>
dplyr::summarise(n = dplyr::n())# A tibble: 6 × 3
# Groups: work_arrangement [6]
work_arrangement work_arrangement_cat n
<dbl> <ord> <int>
1 1 I work entirely in-person (i.e., in an office, on-site) 17717
2 2 I mostly work in-person, with occasional remote days 6169
3 3 I work about evenly in-person/remote 3558
4 4 I mostly work remotely, with occasional in-person days 3226
5 5 I work entirely remotely 2890
6 NA <NA> 35848
# Sanity check: View the distribution of the new variable excluding students
base::table(df_merged$employment_primary,
df_merged$work_arrangement_cat_nostudents, useNA = "always")
I work entirely in-person (i.e., in an office, on-site)
Not in paid employment (by choice/health) 0
Not in paid employment (looking for work) 295
Student non-working (Full or part-time) 0
Employed/working full-time (25+ hours per week) 13053
Employed/working part-time (less than 25 hours per week) 1898
Retired 0
Military service 234
<NA> 0
I mostly work in-person, with occasional remote days I work about evenly in-person/remote
Not in paid employment (by choice/health) 0 0
Not in paid employment (looking for work) 118 79
Student non-working (Full or part-time) 0 0
Employed/working full-time (25+ hours per week) 4363 2251
Employed/working part-time (less than 25 hours per week) 773 581
Retired 0 0
Military service 51 48
<NA> 0 0
I mostly work remotely, with occasional in-person days I work entirely remotely <NA>
Not in paid employment (by choice/health) 0 0 3567
Not in paid employment (looking for work) 71 97 3511
Student non-working (Full or part-time) 0 0 8478
Employed/working full-time (25+ hours per week) 2051 1571 10252
Employed/working part-time (less than 25 hours per week) 586 632 2061
Retired 0 0 2592
Military service 37 28 559
<NA> 0 0 9571
Identification of Sponsored Participants
$br
id
5 <NA>
6445 62963
Class: numeric
$bs
pay
1 <NA>
1 69407
Class: numeric
0 1 <NA>
68208 1200 0
Class: numeric
# Create a new variable to identify sponsored participants
df_merged <- df_merged |>
dplyr::mutate(
sponsored = dplyr::if_else(
!is.na(br) | !is.na(bs) | irl == 1, 1, 0
)
)
# Sanity check
base::table(df_merged$sponsored, useNA = "always")
0 1 <NA>
61762 7646 0
A0.2. Applying exclusion criteria
Direct exclusion criteria
# Identify exclusion criteria and assign status
df_merged <- df_merged |>
# Create explicit flags for each rule
dplyr::mutate(
incomplete = is.na(debts_orig) & irl == 0,
# E1. Not resident based on manual checking of location validation
# important to note that the USA version was the default
# when the survey link was broken or shared without specifying a country
# in the URL metadata parameters.
not_resident = loc_resident == 0,
# E2. Implausible combination of working (3, 4, or 5 on employment)
# and reporting zero income.
working_zero_income =
(stringr::str_detect(employment_orig, "\\b(3|4|5)\\b")) &
(income_orig == 0 | income_text_clean == 0),
# E3. Implausible combination of being retired (6 on employment)
# and having an age <= 25
retired_young =
(stringr::str_detect(employment_orig, "\\b6\\b")) &
(!is.na(age) & age <= 25),
# E4. Implausible combination of reporting
# very high MPWB (well-being) and very high PHQ4 (distress)
extremes_mpwb_phq4 =
!is.na(gad_worry) &
(mpwb_sum >= 65 & phq4_sum >= 24),
# E5. Respondents reporting high MPWB (well-being) and high PHQ-4 (distress),
# combined with unusually short adjusted completion time.
high_mpwb_phq4_speed =
!is.na(gad_worry) &
!is.na(duration_adj) &
(mpwb_sum >= 64 & phq4_sum >= 23 & duration_adj < 10),
# E6. Too-fast based on raw duration,
# except sponsored participants from Ireland (who don't have duration data)
too_fast_raw = duration_sec < 150 & irl == 0,
# E7. We observed a China-specific pattern of
# unusually fast completion times and low response variance.
china_too_fast_low_var =
country == "China" &
duration_adj < 10 &
mpwb_var < 1
) |>
# Assign status based on the ordered exclusion criteria
# (first match is assigned and the rest ignored)
dplyr::mutate(
valid_status = base::factor(dplyr::case_when(
incomplete ~ "incomplete",
not_resident ~ "not residents",
working_zero_income ~ "implausible working with no income",
retired_young ~ "implausible retired young",
extremes_mpwb_phq4 ~ "implausible extremes",
high_mpwb_phq4_speed ~ "implausible high scores with speed",
too_fast_raw ~ "too fast general",
china_too_fast_low_var ~ "too fast low var",
TRUE ~ "passed"
),
levels = c(
"incomplete",
"not residents",
"implausible working with no income",
"implausible retired young",
"implausible extremes",
"implausible high scores with speed",
"too fast general",
"too fast low var",
"passed")
)
)
# Sanity checks: Overall counts per status
base::table(df_merged$valid_status, useNA = "always")
incomplete not residents implausible working with no income implausible retired young
12181 705 271 39
implausible extremes implausible high scores with speed too fast general too fast low var
48 7 1595 737
passed <NA>
53825 0
# Sanity checks: Check counts for incomplete
df_merged |>
dplyr::filter(incomplete) |>
dplyr::group_by(valid_status, Finished, debts_orig, phq_interest) |>
dplyr::summarise(max_progress = max(Progress), n_incomplete = dplyr::n())# A tibble: 1 × 6
# Groups: valid_status, Finished, debts_orig [1]
valid_status Finished debts_orig phq_interest max_progress n_incomplete
<fct> <dbl> <chr> <dbl> <dbl> <int>
1 incomplete 0 <NA> NA 77 12181
# Sanity checks: Check counts for not residents
df_merged |>
dplyr::filter(not_resident & !incomplete) |>
dplyr::group_by(valid_status, country, loc_country) |>
dplyr::summarise(n_not_resident = dplyr::n()) |>
dplyr::arrange(country) |>
print_reactable(sorted_col = "country", width = 500)# Sanity checks: Check counts for participants working with zero income
df_merged |>
dplyr::filter(working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status, employment_cat, income_orig, income_text_orig) |>
dplyr::summarise(n_working_zero_income = dplyr::n()) |>
dplyr::arrange(-n_working_zero_income)# A tibble: 15 × 5
# Groups: valid_status, employment_cat, income_orig [15]
valid_status employment_cat income_orig income_text_orig n_working_zero_income
<fct> <chr> <dbl> <chr> <int>
1 implausible working with no income Employed/working full-time (25+ hours per week) 0 <NA> 119
2 implausible working with no income Employed/working part-time (less than 25 hours per week) 0 <NA> 45
3 implausible working with no income Employed/working full-time (25+ hours per week) 10 0 20
4 implausible working with no income Full-time student; Employed/working part-time (less than 25 … 0 <NA> 20
5 implausible working with no income Full-time student; Employed/working full-time (25+ hours per… 0 <NA> 14
6 implausible working with no income Part-time student; Employed/working part-time (less than 25 … 0 <NA> 13
7 implausible working with no income Part-time student; Employed/working full-time (25+ hours per… 0 <NA> 12
8 implausible working with no income Military service 0 <NA> 10
9 implausible working with no income Full-time student; Military service 0 <NA> 6
10 implausible working with no income Employed/working part-time (less than 25 hours per week) 10 0 3
11 implausible working with no income Part-time student; Military service 0 <NA> 3
12 implausible working with no income Employed/working full-time (25+ hours per week); Military se… 0 <NA> 2
13 implausible working with no income Military service 10 0 2
14 implausible working with no income Full-time student; Employed/working full-time (25+ hours per… 10 0 1
15 implausible working with no income Part-time student; Employed/working part-time (less than 25 … 10 0 1
# Sanity checks: Check counts for retired young participants
df_merged |>
dplyr::filter(retired_young &
!working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status, employment_orig, age_group) |>
dplyr::summarise(n_retired_young = dplyr::n()) |>
dplyr::arrange(-n_retired_young)# A tibble: 4 × 4
# Groups: valid_status, employment_orig [4]
valid_status employment_orig age_group n_retired_young
<fct> <chr> <fct> <int>
1 implausible retired young 6 18-25 27
2 implausible retired young 1,6 18-25 6
3 implausible retired young 2,6 18-25 2
4 implausible retired young 3,6 18-25 2
# Sanity checks: Check counts for extremes in MPWB and PHQ4
df_merged |>
dplyr::filter(extremes_mpwb_phq4 & !retired_young &
!working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status, mpwb_sum, phq4_sum) |>
dplyr::summarise(n_extremes_mpwb_phq4 = dplyr::n()) |>
dplyr::arrange(-n_extremes_mpwb_phq4)# A tibble: 6 × 4
# Groups: valid_status, mpwb_sum [3]
valid_status mpwb_sum phq4_sum n_extremes_mpwb_phq4
<fct> <dbl> <dbl> <int>
1 implausible extremes 70 28 12
2 implausible extremes 69 28 2
3 implausible extremes 67 25 1
4 implausible extremes 69 24 1
5 implausible extremes 70 25 1
6 implausible extremes 70 26 1
# Sanity checks: Check counts for participants with high scores on mpwb and phq4,
# plus unusual speed
df_merged |>
dplyr::filter(high_mpwb_phq4_speed & !extremes_mpwb_phq4 & !retired_young &
!working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status, mpwb_sum, phq4_sum, duration_adj) |>
dplyr::summarise(n_high_mpwb_phq4_speed = dplyr::n()) |>
dplyr::arrange(-n_high_mpwb_phq4_speed)# A tibble: 3 × 5
# Groups: valid_status, mpwb_sum, phq4_sum [3]
valid_status mpwb_sum phq4_sum duration_adj n_high_mpwb_phq4_speed
<fct> <dbl> <dbl> <dbl> <int>
1 implausible high scores with speed 64 23 7.5 1
2 implausible high scores with speed 64 28 5.32 1
3 implausible high scores with speed 66 23 6.04 1
# Sanity checks: Check counts for participants with high scores on mpwb and phq4,
# plus unusual speed
df_merged |>
dplyr::filter(too_fast_raw & !high_mpwb_phq4_speed &
!extremes_mpwb_phq4 & !retired_young &
!working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status) |>
dplyr::summarise(
min(duration_sec), max(duration_sec), n_too_fast_raw = dplyr::n())# A tibble: 1 × 4
valid_status `min(duration_sec)` `max(duration_sec)` n_too_fast_raw
<fct> <dbl> <dbl> <int>
1 too fast general 48 149 718
# Sanity checks: Check counts for China-specific exclusion
df_merged |>
dplyr::filter(china_too_fast_low_var & !too_fast_raw &
!high_mpwb_phq4_speed & !extremes_mpwb_phq4 & !retired_young &
!working_zero_income & !not_resident & !incomplete) |>
dplyr::group_by(valid_status, country) |>
dplyr::summarise(
min(mpwb_var), max(mpwb_var),
min(duration_adj), max(duration_adj), n_china_too_fast_low_var = dplyr::n())# A tibble: 1 × 7
# Groups: valid_status [1]
valid_status country `min(mpwb_var)` `max(mpwb_var)` `min(duration_adj)` `max(duration_adj)` n_china_too_fast_low_var
<fct> <chr> <dbl> <dbl> <dbl> <dbl> <int>
1 too fast low var China 0 0.989 5 9.95 439
Assessments to the Financial variables
Collaborators reviewed the financial variables and created flags indicating whether the responses were valid or not. Basic demographic information about the participants was given only upon request to assist with the review. The income, assets, and debts values that fell within the first income bracket and the last bracket were considered valid by default.
In Zimbabwe, all values were sent for review because there was a concern that participants reported values in Zimbabwean dollar instead of USD as collaborators used in the translation. Also, 14 participants from USA with a value equal to the first income bracket should have been accepted automatically but were sent for review by mistake.
Collaborators were also asked to provide a minimum cut-off for each variable. When the minimum cut-off was higher than the first income bracket, their sheet was updated with the values between the first income bracket and the minimum cut-off. Values of 0 in either financial variable were automatically accepted as is and were not given to collaborators for revision. The values that contained NA, “,” or “.” were also requested for review in order to validate our cleaning script.
The countries where sociodemographic information were provided were: Albania, Bangladesh, Finland, Georgia, Japan, Latvia, Lebanon, Oman, Peru, Portugal, Qatar, Russia, Singapore, Switzerland, Timor-Leste, Ukraine, USA, and Zimbabwe.
This assessment was not conducted for the sponsored participants from Ireland, as they did not provide open field answers regarding income, and were not asked to report assets and debts.
# A manual revision of the values was conducted before the sheet was given to
# collaborators.
df_clean <- df_clean |>
dplyr::mutate(
fin_valid_aut_income =
dplyr::case_when(
# For participants that selected a decile
# instead of providing an open text answer, consider them accepted
income_orig < 10 ~ 1,
# Values of 0 are automatically accepted as is.
income_text_clean == 0 ~ 1,
# If value contains "," or "." or other non-digit, consider them not accepted,
# so collaborators can review them.
!(stringr::str_detect(income_text_orig, "^[0-9]+$")) ~ 0,
# If we detected a weird number, consider them not accepted.
income_wrd ~ 0,
# If value is above 0 but below the income first bracket,
# consider them not accepted.
irl == 0 &
!is.na(income_text_orig) &
income_text_clean != 0 &
income_text_clean > 0 & income_text_clean < income_highpoint_1 ~ 0,
# For all other participants, execute automatic assessment:
# The values that were within the income first bracket
# and the value of the last income bracket were considered not accepted.
irl == 0 &
!is.na(income_text_orig) &
income_text_clean != 0 &
income_text_clean >= income_highpoint_1 &
income_text_clean <= income_lowpoint_9 ~ 1,
# For values above the last income bracket, consider them not accepted.
irl == 0 &
!is.na(income_text_orig) &
income_text_clean != 0 &
income_text_clean > income_lowpoint_9 ~ 0,
TRUE ~ NA_real_
),
fin_valid_aut_assets =
dplyr::case_when(
# Sponsored participants from Ireland are assigned NA
# because no open text answers were collected from them.
irl == 1 ~ NA_real_,
assets_clean == 0 ~ 1,
!(stringr::str_detect(assets_orig, "^[0-9]+$")) ~ 0,
assets_wrd ~ 0,
irl == 0 &
(!is.na(assets_orig) &
assets_clean > 0 & assets_clean < income_highpoint_1) ~ 0,
irl == 0 &
(!is.na(assets_orig) &
assets_clean != 0 &
assets_clean >= income_highpoint_1 &
assets_clean <= income_lowpoint_9) ~ 1,
irl == 0 &
(!is.na(assets_orig) &
assets_clean != 0 &
assets_clean > income_lowpoint_9) ~ 0,
TRUE ~ NA_real_
),
fin_valid_aut_debts =
dplyr::case_when(
irl == 1 ~ NA_real_,
debts_clean == 0 ~ 1,
!(stringr::str_detect(debts_orig, "^[0-9]+$")) ~ 0,
debts_wrd ~ 0,
irl == 0 &
(!is.na(debts_orig) &
debts_clean > 0 & debts_clean < income_highpoint_1) ~ 0,
irl == 0 &
(!is.na(debts_orig) &
debts_clean != 0 &
debts_clean >= income_highpoint_1 &
debts_clean <= income_lowpoint_9) ~ 1,
irl == 0 &
(!is.na(debts_orig) &
debts_clean != 0 &
debts_clean > income_lowpoint_9) ~ 0,
TRUE ~ NA_real_
)
)
# Examine if the minimum cut-off provided is higher than the first income bracket.
df_clean <- df_clean |>
dplyr::mutate(
income_above_cutoff = income_cutoff_min > income_highpoint_1,
assets_above_cutoff = assets_cutoff_min > income_highpoint_1,
debts_above_cutoff = debts_cutoff_min > income_highpoint_1,
fin_valid_aut_income_update =
dplyr::case_when(
income_above_cutoff == FALSE ~ fin_valid_aut_income,
income_above_cutoff == TRUE &
income_text_clean >= income_highpoint_1 &
income_text_clean < income_cutoff_min ~ 0,
TRUE ~ fin_valid_aut_income
),
fin_valid_aut_assets_update =
dplyr::case_when(
assets_above_cutoff == FALSE ~ fin_valid_aut_assets,
assets_above_cutoff == TRUE &
assets_clean >= income_highpoint_1 &
assets_clean < assets_cutoff_min ~ 0,
TRUE ~ fin_valid_aut_assets
),
fin_valid_aut_debts_update =
dplyr::case_when(
debts_above_cutoff == FALSE ~ fin_valid_aut_debts,
debts_above_cutoff == TRUE &
debts_clean >= income_highpoint_1 &
debts_clean < debts_cutoff_min ~ 0,
TRUE ~ fin_valid_aut_debts
)
)
# Sanity check: View the counts of automatic financial validity
base::table(df_clean$fin_valid_aut_income, useNA = "always")
0 1 <NA>
3185 50531 109
0 1 <NA>
27728 24897 1200
0 1 <NA>
13503 39122 1200
After we transmitted the values that were not automatically classified to collaborators in each country for review, we received back their assessments. We have extracted automatically the sheet with their assessments, and combined them into a single file.
# Extract sections from Excel files in folder "777_countries_documentation"
files <- list.files(
path = "777_countries_documentation",
pattern = "\\.xls[x]?$",
full.names = TRUE) |>
purrr::discard(
# Exclude files named 777_Zambia and 777_Global
~stringr::str_detect(basename(.x),"^777_(Zambia|Global)"))
process_sheet <- function(path, sheet_name, start_row, tab_label) {
sheet_all <- readxl::read_excel(path, sheet = sheet_name, col_names = FALSE)
section <- sheet_all |> dplyr::slice(start_row:nrow(sheet_all)) |> dplyr::select(1:9)
# drop header row
section <- section |> dplyr::slice(-1)
names(section) <- c(
"ResponseId",
"UserLanguage",
"orig",
"clean",
"classification",
"value",
"cutoff_max",
"cutoff_min",
"notes"
)
section <- section |>
dplyr::mutate(
file = tools::file_path_sans_ext(basename(path)),
tab = tab_label
)
section
}
assessment_fin <- purrr::map_dfr(files, function(path) {
d1 <- process_sheet(path, "HOUSEHOLD INCOME", 22, "income")
d2 <- process_sheet(path, "ASSETS", 9, "assets")
d3 <- process_sheet(path, "DEBTS", 9, "debts")
dplyr::bind_rows(d1, d2, d3)
}) |>
dplyr::rename(
change = value
) |>
dplyr::mutate(
clean = base::as.numeric(clean),
cutoff_max = base::as.numeric(cutoff_max),
cutoff_min = base::as.numeric(cutoff_min)
) |>
tidyr::pivot_wider(
id_cols = c("ResponseId", "UserLanguage"),
names_from = "tab",
values_from = c(
"change",
"classification",
"cutoff_min",
"cutoff_max",
"orig",
"clean"
),
names_sep = "_"
)Rows: 33,245
Columns: 20
$ ResponseId <chr> "R_2S9d1LQe5gzhMGp", "R_2duaXZQf76tNTnX", "R_8YEzJo4GF1VSJiU", "R_8D…
$ UserLanguage <chr> "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "S…
$ change_income <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ change_assets <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ change_debts <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ classification_income <chr> "Not possible/not believable", "Not possible/not believable", "Not p…
$ classification_assets <chr> "OK", "OK", NA, "OK", "OK", "OK", "OK", "OK", "OK", NA, NA, NA, NA, …
$ classification_debts <chr> "OK", NA, "Cannot determine", NA, "Cannot determine", NA, NA, NA, NA…
$ cutoff_min_income <dbl> 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000…
$ cutoff_min_assets <dbl> 1000, 1000, NA, 1000, 1000, 1000, 1000, 1000, 1000, NA, NA, NA, NA, …
$ cutoff_min_debts <dbl> 1000, NA, 1000, NA, 1000, NA, NA, NA, NA, NA, NA, NA, NA, 1000, NA, …
$ cutoff_max_income <dbl> 2500000, 2500000, 2500000, 2500000, 2500000, 2500000, 2500000, 25000…
$ cutoff_max_assets <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cutoff_max_debts <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ orig_income <chr> "100", "410", "500", "600", "700", "800", "900", "1200", "2000", "30…
$ orig_assets <chr> "10000", "100000", NA, "10000", "5000", "5000", "3000", "500.000", "…
$ orig_debts <chr> "5000", NA, "500", NA, "250", NA, NA, NA, NA, NA, NA, NA, NA, "30000…
$ clean_income <dbl> 100, 410, 500, 600, 700, 800, 900, 1200, 2000, 3000, 3840, 5000, 700…
$ clean_assets <dbl> 10000, 100000, NA, 10000, 5000, 5000, 3000, 500000, 2000, NA, NA, NA…
$ clean_debts <dbl> 5000, NA, 500, NA, 250, NA, NA, NA, NA, NA, NA, NA, NA, 30000, NA, N…
# Sanity check: Compare the cut-off min values
fin_cut_income <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_income) |> dplyr::summarise() |> dplyr::filter(!is.na(cutoff_min_income)) |> dplyr::rename(income_cutoff_min = cutoff_min_income)
df_cut_income <- df_clean |> dplyr::group_by(UserLanguage, income_cutoff_min) |> dplyr::summarise() |> dplyr::filter(!is.na(income_cutoff_min) & UserLanguage %in% fin_cut_income$UserLanguage)
dplyr::setequal(fin_cut_income, df_cut_income)[1] TRUE
fin_cut_assets <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_assets) |> dplyr::summarise() |> dplyr::filter(!is.na(cutoff_min_assets)) |> dplyr::rename(assets_cutoff_min = cutoff_min_assets)
df_cut_assets <- df_clean |> dplyr::group_by(UserLanguage, assets_cutoff_min) |> dplyr::summarise() |> dplyr::filter(!is.na(assets_cutoff_min) & UserLanguage %in% fin_cut_assets$UserLanguage)
dplyr::setequal(fin_cut_assets, df_cut_assets)[1] TRUE
fin_cut_debts <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_debts) |> summarise() |> dplyr::filter(!is.na(cutoff_min_debts)) |> dplyr::rename(debts_cutoff_min = cutoff_min_debts)
df_cut_debts <- df_clean |> dplyr::group_by(UserLanguage, debts_cutoff_min) |> dplyr::summarise() |> filter(!is.na(debts_cutoff_min) & UserLanguage %in% fin_cut_debts$UserLanguage)
dplyr::setequal(fin_cut_debts, df_cut_debts)[1] TRUE
# Sanity check:
# Are there any UserLanguage in assessment_fin that are not in df_merged?
base::setdiff(
unique(assessment_fin$UserLanguage),
unique(df_merged$UserLanguage)
)character(0)
# Sanity check:
# All values between clean_income in assessment_fin
# and income_text_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_income) |> filter(!is.na(clean_income)) |>
dplyr::left_join(
df_clean |> select(ResponseId, income_text_clean),
by = "ResponseId"
) |>
dplyr::mutate(match = clean_income == income_text_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())# A tibble: 1 × 2
match n
<lgl> <int>
1 TRUE 3183
# All values between clean_assets in assessment_fin
# and assets_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_assets) |> filter(!is.na(clean_assets)) |>
dplyr::left_join(
df_clean |> select(ResponseId, assets_clean),
by = "ResponseId"
) |>
dplyr::mutate(match = clean_assets == assets_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())# A tibble: 1 × 2
match n
<lgl> <int>
1 TRUE 27351
# All values between clean_debts in assessment_fin
# and debts_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_debts) |> filter(!is.na(clean_debts)) |>
dplyr::left_join(
df_clean |> select(ResponseId, debts_clean),
by = "ResponseId"
) |>
dplyr::mutate(match = clean_debts == debts_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())# A tibble: 1 × 2
match n
<lgl> <int>
1 TRUE 12860
# Sanity check: Any duplicated ResponseId in assessment_fin?
assessment_fin |>
dplyr::count(ResponseId) |>
dplyr::filter(n > 1) |>
base::nrow()[1] 0
[1] 53825
df_clean <- df_clean |>
dplyr::left_join(
assessment_fin |> dplyr::select(
ResponseId,
classification_income,
change_income,
classification_assets,
change_assets,
classification_debts,
change_debts
),
by = c("ResponseId")
) |>
# Apply the changes recommended by collaborators
dplyr::mutate(
income_text_reviewed = dplyr::case_when(
!is.na(classification_income) &
stringr::str_detect(classification_income,
"Change to: \\[add value on column F\\]")
~ as.numeric(change_income),
TRUE ~ income_text_clean
),
assets_reviewed = dplyr::case_when(
!is.na(classification_assets) &
stringr::str_detect(classification_assets,
"Change to: \\[add value on column F\\]")
~ as.numeric(change_assets),
TRUE ~ assets_clean
),
debts_reviewed = dplyr::case_when(
!is.na(classification_debts) &
stringr::str_detect(classification_debts,
"Change to: \\[add value on column F\\]")
~ as.numeric(change_debts),
TRUE ~ debts_clean
)
)
nrow(df_clean)[1] 53825
# Sanity checks: View the counts of cells that were automatically approved
# and were still reviewed by collaborators
df_clean |> dplyr::select(ResponseId, country, income_text_clean,
classification_income, fin_valid_aut_income,
fin_valid_aut_income_update) |>
dplyr::filter(fin_valid_aut_income_update == 1 & !is.na(classification_income)
& income_text_clean > 0) |>
dplyr::group_by(country, classification_income) |>
dplyr::summarise(n = dplyr::n()) |>
base::nrow()[1] 0
df_clean |> dplyr::select(ResponseId, country, assets_clean,
classification_assets, fin_valid_aut_assets,
fin_valid_aut_assets_update) |>
dplyr::filter(fin_valid_aut_assets_update==1 & !is.na(classification_assets)
& assets_clean > 0) |>
dplyr::group_by(country, classification_assets) |>
dplyr::summarise(n = dplyr::n())# A tibble: 3 × 3
# Groups: country [2]
country classification_assets n
<chr> <chr> <int>
1 USA Cannot determine 2
2 USA OK 94
3 Zimbabwe OK 82
df_clean |> dplyr::select(ResponseId, country, debts_clean,
classification_debts, fin_valid_aut_debts,
fin_valid_aut_debts_update) |>
dplyr::filter(fin_valid_aut_debts_update== 1 & !is.na(classification_debts) &
debts_clean > 0) |>
dplyr::group_by(country, classification_debts) |>
dplyr::summarise(n = dplyr::n())# A tibble: 3 × 3
# Groups: country [2]
country classification_debts n
<chr> <chr> <int>
1 USA OK 14
2 Zimbabwe Change to: [add value on column F] 20
3 Zimbabwe OK 58
# Sanity checks: View the counts of cells that were automatically disapproved
# and were not reviewed by collaborators
df_clean |> dplyr::select(ResponseId, country, income_text_clean,
classification_income, fin_valid_aut_income,
fin_valid_aut_income_update) |>
dplyr::filter(fin_valid_aut_income_update == 0 & is.na(classification_income)
& income_text_clean > 0) |>
dplyr::group_by(country, classification_income) |>
dplyr::summarise(n = dplyr::n()) |>
base::nrow()[1] 0
df_clean |> dplyr::select(ResponseId, country, assets_clean,
classification_assets, fin_valid_aut_assets,
fin_valid_aut_assets_update) |>
dplyr::filter(fin_valid_aut_assets_update == 0 & is.na(classification_assets)
& assets_clean > 0) |>
dplyr::group_by(country, classification_assets) |>
dplyr::summarise(n = dplyr::n()) |>
base::nrow()[1] 0
df_clean |> dplyr::select(ResponseId, country, debts_clean, classification_debts,
fin_valid_aut_debts, fin_valid_aut_debts_update) |>
dplyr::filter(fin_valid_aut_debts_update == 0 & is.na(classification_debts)
& debts_clean > 0) |>
dplyr::group_by(country, classification_debts) |>
dplyr::summarise(n = dplyr::n()) |>
base::nrow()[1] 0
# Create variable where we fit the open field answers into the brackets
find_decile <- function(lang, income_val) {
# If value is missing, return NA for this row
if (is.na(income_val)) {
return(NA_real_)
}
# Subset brackets for language
brackets <- income_gaps[income_gaps$UserLanguage == lang, ]
# If no brackets available for this language, return NA
if (nrow(brackets) == 0) {
return(NA_real_)
}
for (j in seq_len(nrow(brackets))) {
low <- brackets$income_lowpoint_adj[j]
high <- brackets$income_highpoint_adj[j]
# Skip rows with missing low
if (is.na(low)) {
next
}
# Open-ended bracket: [low, ∞)
if (is.na(high)) {
if (income_val >= low) {
return(base::as.numeric(brackets$income_orig[j]))
} else {
next
}
}
# Interval [low, high] inclusive
if (income_val >= low && income_val <= high) {
return(base::as.numeric(brackets$income_orig[j]))
}
}
# If higher than all defined brackets, assign 9 by your current rule
9
}
df_clean <- df_clean |>
dplyr::mutate(
income_merg = dplyr::case_when(
is.na(income_orig) ~ NA_real_,
!is.na(income_orig) & income_orig != 10 ~ income_orig,
income_orig == 10 & is.na(income_text_reviewed) ~ NA_real_,
income_orig == 10 &
!is.na(income_text_reviewed) &
income_text_reviewed == 0 ~ 0,
TRUE ~ purrr::map2_dbl(
UserLanguage,
income_text_reviewed,
find_decile
)
),
income_merg_cat = base::factor(
dplyr::case_when(
income_merg == 0 ~ "No income",
income_merg == 1 ~ "Second decile",
income_merg == 2 ~ "Third decile",
income_merg == 3 ~ "Fourth decile",
income_merg == 4 ~ "Fifth decile",
income_merg == 5 ~ "Sixth decile",
income_merg == 6 ~ "Seventh decile",
income_merg == 7 ~ "Eighth decile",
income_merg == 8 ~ "Ninth decile",
income_merg == 9 ~ "Tenth decile",
TRUE ~ NA_character_
),
levels = c(
"No income",
"Second decile",
"Third decile",
"Fourth decile",
"Fifth decile",
"Sixth decile",
"Seventh decile",
"Eighth decile",
"Ninth decile",
"Tenth decile"
),
ordered = TRUE
),
income_merg_group = base::factor(
dplyr::case_when(
income_merg_cat == "No income"
~ "No income",
income_merg_cat %in% c("Second decile", "Third decile", "Fourth decile")
~ "Low",
income_merg_cat %in% c("Fifth decile", "Sixth decile")
~ "Mid",
income_merg_cat %in% c("Seventh decile", "Eighth decile", "Ninth decile")
~ "Upper",
income_merg_cat == "Tenth decile"
~ "Wealthiest",
TRUE ~ NA_character_
),
levels = c("No income", "Low", "Mid", "Upper", "Wealthiest"),
ordered = TRUE
)
)
df_clean <- df_clean |>
dplyr::left_join(
income_gaps |>
dplyr::select(
UserLanguage,
income_orig,
income_lowpoint_adj,
income_highpoint_adj
),
by = c("UserLanguage", "income_merg" = "income_orig")
) |>
dplyr::mutate(
income_merg_translated = dplyr::case_when(
is.na(income_merg) ~ NA_character_,
income_merg == 0 ~ "0",
# Closed interval [low, high]
!is.na(income_lowpoint_adj) &
!is.na(income_highpoint_adj)
~ paste0(
income_lowpoint_adj,
" - ",
income_highpoint_adj
),
# Open upper bound [low, ∞)
!is.na(income_lowpoint_adj) &
is.na(income_highpoint_adj)
~ paste0(
income_lowpoint_adj,
"+"
),
TRUE ~ NA_character_
)
)
# Sanity checks: View counts of merged income variable
df_clean |>
dplyr::filter(
income_orig == 10,
!is.na(income_text_reviewed)
) |>
dplyr::group_by(
UserLanguage,
income_merg,
income_merg_translated
) |>
dplyr::summarise(
min_income_text_reviewed = min(income_text_reviewed, na.rm = TRUE),
max_income_text_reviewed = max(income_text_reviewed, na.rm = TRUE)
) |>
print_reactable(sorted_col = "UserLanguage", width = 800)df_clean |>
dplyr::group_by(income_orig, income_merg) |>
dplyr::summarise(n = dplyr::n()) |>
base::print(n = Inf)# A tibble: 21 × 3
# Groups: income_orig [12]
income_orig income_merg n
<dbl> <dbl> <int>
1 0 0 1866
2 1 1 4775
3 2 2 5754
4 3 3 5961
5 4 4 5778
6 5 5 5195
7 6 6 4646
8 7 7 4583
9 8 8 3585
10 9 9 5347
11 10 0 91
12 10 1 1951
13 10 2 834
14 10 3 717
15 10 4 583
16 10 5 443
17 10 6 356
18 10 7 297
19 10 8 208
20 10 9 746
21 NA NA 109
Red flag exclusion
Each flag corresponds to a specific pattern that may indicate low-quality data. This process was only applied to participants who passed the direct exclusion criteria.
# Identified participants with IP addresses known to be associated with botnets.
botnet_ids <-
readr::read_csv("111_response_ids_botnets.csv", show_col_types = FALSE) |>
dplyr::pull(ResponseId) |>
base::trimws(); length(botnet_ids)[1] 262
# Identified participants with IP addresses massively repeated
# across multiple responses.
massive_rep_ids <-
readr::read_csv("111_ip_repeated.csv", show_col_types = FALSE) |>
dplyr::pull(ResponseId) |>
base::trimws(); length(massive_rep_ids)[1] 10300
# Start the flagging process.
df_flagged <- df_clean |>
dplyr::mutate(
# F1. Household >=4 and zero income
flag_hh4_zero_income =
dplyr::if_else(
irl == 0 &
household_size >= 4 &
income_merg == 0,
1,
0,
missing = NA_real_
),
# F2. Any financial items not valid
flag_fin_invalid =
dplyr::if_else(
irl == 0 &
(
(!is.na(classification_assets) &
!(classification_assets %in% c("OK", "Change to: [add value on column F]"))) |
(!is.na(classification_debts) &
!(classification_debts %in% c("OK", "Change to: [add value on column F]"))) |
(!is.na(classification_income) &
!(classification_income %in% c("OK", "Change to: [add value on column F]")))
),
1,
0,
missing = NA_real_
),
# F3. Low variance in MPWB, life satisfaction = 10,
# and no income or very low education
flag_ls10_noincome_var =
dplyr::if_else(
irl == 0 &
mpwb_var < 1 &
life_satisfaction == 10 &
(education_recoded == 1 |
income_merg == 0),
1,
0,
missing = NA_real_
),
# F4. Assets and debts are the same value (excluding both zero and NA)
flag_assets_debts_same =
dplyr::if_else(
irl == 0 &
!is.na(assets_clean) &
!is.na(debts_clean) &
assets_clean == debts_clean &
!(assets_clean == 0 & debts_clean == 0),
1,
0,
missing = NA_real_
),
# F5. Full-time student and lowest education level
# (Peru participants that selected inclusive education are exempt because
# they have NA in education_recoded)
flag_student_lowedu =
dplyr::if_else(
!is.na(education_recoded) &
education_recoded == 1 &
!is.na(employment_orig) &
stringr::str_detect(employment_orig, "\\b1\\b"),
1,
0,
missing = NA_real_
),
# F6. Zero variance in MPWB items
flag_mpwb_zerovar =
dplyr::if_else(
!is.na(mpwb_var) & mpwb_var == 0,
0.5,
0,
missing = NA_real_
),
# F7. Nonsensical sex or ethnicity
flag_nonsensical_sex_ethn =
dplyr::if_else(
irl == 0 &
(
(!is.na(sex_text_recoded) & sex_text_recoded == "Cannot determine") |
(!is.na(ethnicity_specify) & ethnicity_specify == "Cannot determine")
),
1,
0,
missing = NA_real_
),
# F8. High MPWB and high PHQ-4
flag_high_mpwb_phq4 =
dplyr::if_else(
irl == 0 &
!is.na(gad_worry) &
mpwb_sum >= 60 &
phq4_sum >= 20 &
duration_adj < 10,
1,
0,
missing = NA_real_
),
# F9. LS vs mean MPWB mismatch
flag_ls_vs_mpwb =
dplyr::case_when(
base::abs(life_satisfaction - mpwb_mean) > 5 ~ 2,
base::abs(life_satisfaction - mpwb_mean) > 4 ~ 1,
TRUE ~ 0
),
# F10. Age >75 and working or studying
flag_age75_workstudy =
dplyr::if_else(
age >= 75 &
!is.na(employment_orig) &
stringr::str_detect(employment_orig, "\\b(1|2|3|4|5)\\b"),
1,
0,
missing = NA_real_
),
# F11. Advanced education and <22 years
flag_young_advance =
dplyr::if_else(
!is.na(education_recoded) &
education_recoded == 5 &
age < 22,
1,
0,
missing = NA_real_
),
# F12. Short duration and low MPWB variance
flag_duration_var =
dplyr::if_else(
irl == 0 &
duration_adj < 10 &
mpwb_var < 1,
1,
0,
missing = NA_real_
),
# F13. Independent, <20, and richest income
flag_young_rich_alone =
dplyr::if_else(
household_size == 1 &
age < 20 &
income_merg >= 8,
1,
0,
missing = NA_real_
),
# F14. Retired and working at the same time
flag_retired_working =
dplyr::if_else(
irl == 0 &
!is.na(employment_orig) &
stringr::str_detect(employment_orig, "\\b6\\b") &
stringr::str_detect(employment_orig, "\\b(3|4|5)\\b"),
1,
0,
missing = NA_real_
),
# F15. Strange numbers in financial variables
# If collaborators already marked the value as not OK,
# then there is no need to repeat this flag.
flag_weird_nr =
dplyr::case_when(
irl == 0 &
(
(assets_wrd & classification_assets == "OK") |
(debts_wrd & classification_debts == "OK") |
(income_wrd & classification_income == "OK")
)
~ 1,
TRUE ~ 0
),
# F16. Botnet ResponseIds
flag_botnet =
dplyr::if_else(
ResponseId %in% botnet_ids,
1,
0,
missing = NA_real_
),
# F17. Massive repetition of IP + short duration + low variance
flag_rep =
dplyr::if_else(
ResponseId %in% massive_rep_ids &
duration_adj < 10 &
mpwb_var < 1,
1,
0,
missing = NA_real_
),
# Total flags
flag_total =
base::rowSums(
dplyr::across(dplyr::starts_with("flag_")),
na.rm = TRUE
),
# Exclusion flag
exclusion_flags =
dplyr::if_else(
flag_total > 4,
1,
0
),
# Update valid_status
valid_status = base::as.character(valid_status),
valid_status =
base::factor(
dplyr::case_when(
exclusion_flags == 1 ~ "flagged",
TRUE ~ valid_status
),
levels = c(
"flagged",
"passed"
)
)
)
# Sanity Check: View the counts of exclusion flags
table(df_flagged$valid_status, df_flagged$exclusion_flags, useNA = "always")
0 1 <NA>
flagged 0 26 0
passed 53799 0 0
<NA> 0 0 0
# Sanity Check: View the counts and percentages of each flag
df_flagged |>
dplyr::select(dplyr::starts_with("flag_")) |>
dplyr::summarise(
dplyr::across(
dplyr::everything(),
~ sum((!is.na(.) & . != 0))
)
) |>
tidyr::pivot_longer(
cols = dplyr::everything(),
names_to = "flag",
values_to = "n_flagged"
) |>
dplyr::mutate(
percent_flagged = 100 * n_flagged / nrow(df_flagged)
) |>
dplyr::arrange(dplyr::desc(percent_flagged))# A tibble: 18 × 3
flag n_flagged percent_flagged
<chr> <int> <dbl>
1 flag_total 16396 30.5
2 flag_duration_var 6935 12.9
3 flag_fin_invalid 4366 8.11
4 flag_ls_vs_mpwb 3146 5.84
5 flag_mpwb_zerovar 1804 3.35
6 flag_nonsensical_sex_ethn 1052 1.95
7 flag_assets_debts_same 703 1.31
8 flag_rep 621 1.15
9 flag_hh4_zero_income 493 0.916
10 flag_retired_working 337 0.626
11 flag_ls10_noincome_var 215 0.399
12 flag_botnet 202 0.375
13 flag_student_lowedu 162 0.301
14 flag_weird_nr 124 0.230
15 flag_age75_workstudy 70 0.130
16 flag_young_advance 50 0.0929
17 flag_high_mpwb_phq4 29 0.0539
18 flag_young_rich_alone 8 0.0149
# Sanity check: View combinations of classifications where flag_hh4_zero_income is raised
df_flagged |>
dplyr::filter(flag_hh4_zero_income == 1) |>
dplyr::group_by(household_size, income_merg, irl) |>
dplyr::summarise(n = dplyr::n())# A tibble: 15 × 4
# Groups: household_size, income_merg [15]
household_size income_merg irl n
<dbl> <dbl> <dbl> <int>
1 4 0 0 196
2 5 0 0 120
3 6 0 0 68
4 7 0 0 41
5 8 0 0 20
6 9 0 0 10
7 10 0 0 15
8 11 0 0 1
9 12 0 0 4
10 13 0 0 3
11 14 0 0 3
12 15 0 0 2
13 17 0 0 1
14 18 0 0 1
15 20 0 0 8
# Sanity check: View combinations of classifications where flag_fin_invalid is raised
df_flagged |>
dplyr::filter(flag_fin_invalid == 1) |>
dplyr::group_by(classification_income, classification_assets, classification_debts, irl) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "classification_income", width = 800)# Sanity check: View combinations of classifications where flag_ls10_noincome_var is raised
df_flagged |>
dplyr::filter(flag_ls10_noincome_var == 1) |>
dplyr::group_by(mpwb_var, life_satisfaction, education_recoded, income_merg, irl) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "mpwb_var", width = 800)# Sanity check: View combinations of classifications where flag_assets_debts_same is raised
df_flagged |>
dplyr::filter(flag_assets_debts_same == 1) |>
dplyr::group_by(assets_clean, debts_clean, irl) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "assets_clean", width = 500)# Sanity check: View combinations of classifications where flag_student_lowedu is raised
df_flagged |>
dplyr::filter(flag_student_lowedu == 1) |>
dplyr::group_by(education_recoded, employment_orig) |>
dplyr::summarise(n = dplyr::n())# A tibble: 8 × 3
# Groups: education_recoded [1]
education_recoded employment_orig n
<dbl> <chr> <int>
1 1 1 130
2 1 1,3 7
3 1 1,4 7
4 1 1,5 1
5 1 1,6 1
6 1 1,7 3
7 1 1,8 6
8 1 1,9 7
# Sanity check: View combinations of classifications where flag_mpwb_zerovar is raised
df_flagged |>
dplyr::filter(flag_mpwb_zerovar == 0.5) |>
dplyr::group_by(mpwb_var) |>
dplyr::summarise(n = dplyr::n())# A tibble: 1 × 2
mpwb_var n
<dbl> <int>
1 0 1804
# Sanity check: View combinations of classifications where flag_nonsensical_sex_ethn is raised
df_flagged |>
dplyr::filter(flag_nonsensical_sex_ethn == 1) |>
dplyr::group_by(sex_text_recoded, ethnicity_specify, irl) |>
dplyr::summarise(n = dplyr::n())# A tibble: 10 × 4
# Groups: sex_text_recoded, ethnicity_specify [10]
sex_text_recoded ethnicity_specify irl n
<chr> <chr> <dbl> <int>
1 Cannot determine Cannot determine 0 14
2 Cannot determine Cypriot 0 1
3 Cannot determine Other 0 5
4 Cannot determine Roma 0 1
5 Cannot determine The 4 Rs: (Nkole, Kiga, Batooro and Banyoro) 0 1
6 Cannot determine White 0 1
7 Cannot determine <NA> 0 81
8 Male Cannot determine 0 2
9 Non-binary Cannot determine 0 6
10 <NA> Cannot determine 0 940
# Sanity check: View combinations of classifications where flag_high_mpwb_phq4 is raised
df_flagged |>
dplyr::filter(flag_high_mpwb_phq4 == 1) |>
dplyr::group_by(mpwb_sum, phq4_sum, gad_worry, duration_adj, irl) |>
dplyr::summarise(n = dplyr::n()) |>
base::print(n = Inf)# A tibble: 29 × 6
# Groups: mpwb_sum, phq4_sum, gad_worry, duration_adj [29]
mpwb_sum phq4_sum gad_worry duration_adj irl n
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 60 20 5 7.66 0 1
2 60 22 5 6.41 0 1
3 60 25 6 9.03 0 1
4 60 28 7 7.66 0 1
5 61 22 4 9.59 0 1
6 61 28 7 7.10 0 1
7 62 20 4 7.8 0 1
8 62 22 7 8.83 0 1
9 62 25 5 9.66 0 1
10 62 25 7 6.72 0 1
11 62 26 6 7.79 0 1
12 62 28 7 9.39 0 1
13 62 28 7 9.93 0 1
14 63 20 6 9.17 0 1
15 63 21 4 5.55 0 1
16 63 24 5 6.76 0 1
17 63 24 6 6.59 0 1
18 63 24 6 8.2 0 1
19 63 24 6 9.10 0 1
20 63 25 7 5.90 0 1
21 63 28 7 7.03 0 1
22 64 22 7 8.23 0 1
23 65 20 5 9 0 1
24 65 21 6 5.37 0 1
25 66 21 6 7.83 0 1
26 66 22 6 9.38 0 1
27 70 20 5 6.90 0 1
28 70 20 7 7.93 0 1
29 70 22 4 6.6 0 1
# Sanity check: View combinations of classifications where flag_ls_vs_mpwb is raised
df_flagged |>
dplyr::filter(flag_ls_vs_mpwb >= 1) |>
dplyr::mutate(diff_ls_mpwb = base::abs(life_satisfaction - mpwb_mean)) |>
dplyr::group_by(life_satisfaction, mpwb_mean, diff_ls_mpwb, flag_ls_vs_mpwb, irl) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "mpwb_mean", width = 500)# Sanity check: View combinations of classifications where flag_age75_workstudy is raised
df_flagged |>
dplyr::filter(flag_age75_workstudy == 1) |>
dplyr::group_by(employment_cat, age) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "employment_cat", width = 800)# Sanity check: View combinations of classifications where flag_young_advance is raised
df_flagged |>
dplyr::filter(flag_young_advance == 1) |>
dplyr::group_by(education_recoded_cat, age) |>
dplyr::summarise(n = dplyr::n())# A tibble: 4 × 3
# Groups: education_recoded_cat [1]
education_recoded_cat age n
<ord> <dbl> <int>
1 Advanced 18 5
2 Advanced 19 13
3 Advanced 20 12
4 Advanced 21 20
# Sanity check: View combinations of classifications where flag_duration_var is raised
df_flagged |>
dplyr::filter(flag_duration_var == 1) |>
dplyr::mutate(min_duration_adj = min(duration_adj),
max_duration_adj = max(duration_adj),
min_mpwb_var = min(mpwb_var),
max_mpwb_var = max(mpwb_var)) |>
dplyr::group_by(min_duration_adj, max_duration_adj,
min_mpwb_var, max_mpwb_var, irl) |>
dplyr::summarise(n = dplyr::n())# A tibble: 1 × 6
# Groups: min_duration_adj, max_duration_adj, min_mpwb_var, max_mpwb_var [1]
min_duration_adj max_duration_adj min_mpwb_var max_mpwb_var irl n
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 5 9.97 0 0.989 0 6935
# Sanity check: View combinations of classifications where flag_young_rich_alone is raised
df_flagged |>
dplyr::filter(flag_young_rich_alone == 1) |>
dplyr::group_by(household_size, age, income_merg) |>
summarise(n = dplyr::n())# A tibble: 4 × 4
# Groups: household_size, age [2]
household_size age income_merg n
<dbl> <dbl> <dbl> <int>
1 1 18 8 2
2 1 18 9 1
3 1 19 8 3
4 1 19 9 2
# Sanity check: View combinations of classifications where flag_retired_working is raised
df_flagged |>
dplyr::filter(flag_retired_working == 1) |>
dplyr::group_by(employment_orig, age, irl) |>
summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "employment_orig", width = 500)# Sanity check: View combinations of classifications where flag_weird_nr is raised
df_flagged |>
dplyr::filter(flag_weird_nr == 1) |>
dplyr::group_by(assets_wrd, classification_assets,
debts_wrd, classification_debts,
income_wrd, classification_income, irl) |>
dplyr::summarise(n = dplyr::n()) |>
base::print(n = Inf)# A tibble: 25 × 8
# Groups: assets_wrd, classification_assets, debts_wrd, classification_debts, income_wrd, classification_income [25]
assets_wrd classification_assets debts_wrd classification_debts income_wrd classification_income irl n
<lgl> <chr> <lgl> <chr> <lgl> <chr> <dbl> <int>
1 FALSE OK FALSE <NA> TRUE OK 0 16
2 FALSE OK TRUE OK FALSE OK 0 2
3 FALSE OK TRUE OK FALSE Very unlikely to be true 0 2
4 FALSE OK TRUE OK FALSE <NA> 0 19
5 FALSE OK TRUE <NA> TRUE OK 0 1
6 FALSE Very unlikely to be true FALSE <NA> TRUE OK 0 1
7 FALSE Very unlikely to be true TRUE OK FALSE <NA> 0 2
8 FALSE <NA> FALSE OK TRUE OK 0 3
9 FALSE <NA> FALSE <NA> TRUE OK 0 6
10 FALSE <NA> TRUE OK FALSE OK 0 1
11 FALSE <NA> TRUE OK FALSE <NA> 0 13
12 TRUE Not possible/not believable TRUE Not possible/not believable TRUE OK 0 1
13 TRUE OK FALSE Not possible/not believable FALSE <NA> 0 3
14 TRUE OK FALSE OK FALSE Change to: [add value on column F] 0 1
15 TRUE OK FALSE OK FALSE OK 0 1
16 TRUE OK FALSE OK FALSE <NA> 0 6
17 TRUE OK FALSE Very unlikely to be true FALSE <NA> 0 7
18 TRUE OK FALSE <NA> FALSE Change to: [add value on column F] 0 1
19 TRUE OK FALSE <NA> FALSE OK 0 1
20 TRUE OK FALSE <NA> FALSE Very unlikely to be true 0 1
21 TRUE OK FALSE <NA> FALSE <NA> 0 31
22 TRUE OK TRUE Cannot determine FALSE <NA> 0 1
23 TRUE OK TRUE OK FALSE <NA> 0 1
24 TRUE OK TRUE Very unlikely to be true FALSE Very unlikely to be true 0 1
25 TRUE Very unlikely to be true TRUE OK FALSE <NA> 0 2
# Apply reviewed financial variables, unless all three flags F4, F15, and F2 are raised
df_flagged <- df_flagged |>
dplyr::mutate(
assets_reviewed = dplyr::case_when(
flag_assets_debts_same == 1 &
flag_weird_nr == 1 &
flag_fin_invalid == 1 ~ NA_real_,
!is.na(classification_assets) &
!classification_assets %in% c("OK", "Change to: [add value on column F]")
~ NA_real_,
TRUE ~ assets_reviewed
),
debts_reviewed = dplyr::case_when(
flag_assets_debts_same == 1 &
flag_weird_nr == 1 &
flag_fin_invalid == 1 ~ NA_real_,
!is.na(classification_debts) &
!classification_debts %in% c("OK", "Change to: [add value on column F]")
~ NA_real_,
TRUE ~ debts_reviewed
),
income_text_reviewed = dplyr::case_when(
flag_assets_debts_same == 1 &
flag_weird_nr == 1 &
flag_fin_invalid == 1 ~ NA_real_,
!is.na(classification_income) &
!classification_income %in% c("OK", "Change to: [add value on column F]")
~ NA_real_,
TRUE ~ income_text_reviewed
),
income_merg = dplyr::case_when(
flag_assets_debts_same == 1 &
flag_weird_nr == 1 &
flag_fin_invalid == 1 ~ NA_real_,
!is.na(classification_income) &
!classification_income %in% c("OK", "Change to: [add value on column F]")
~ NA_real_,
TRUE ~ income_merg
),
income_merg_translated = dplyr::case_when(
flag_assets_debts_same == 1 &
flag_weird_nr == 1 &
flag_fin_invalid == 1 ~ NA_character_,
!is.na(classification_income) &
!classification_income %in% c("OK", "Change to: [add value on column F]")
~ NA_character_,
TRUE ~ income_merg_translated
),
income_merg_group = dplyr::case_when(
flag_assets_debts_same == 1 &
flag_weird_nr == 1 &
flag_fin_invalid == 1 ~ NA_character_,
!is.na(classification_income) &
!classification_income %in% c("OK", "Change to: [add value on column F]")
~ NA_character_,
TRUE ~ income_merg_group
),
income_merg_cat = dplyr::case_when(
flag_assets_debts_same == 1 &
flag_weird_nr == 1 &
flag_fin_invalid == 1 ~ NA_character_,
!is.na(classification_income) &
!classification_income %in% c("OK", "Change to: [add value on column F]")
~ NA_character_,
TRUE ~ income_merg_cat
),
income_merg_cat = base::factor(
income_merg_cat,
levels = c(
"No income",
"Second decile",
"Third decile",
"Fourth decile",
"Fifth decile",
"Sixth decile",
"Seventh decile",
"Eighth decile",
"Ninth decile",
"Tenth decile"
),
ordered = TRUE
),
income_merg_group = base::factor(
income_merg_group,
levels = c(
"No income",
"Low",
"Mid",
"Upper",
"Wealthiest"
),
ordered = TRUE
)
)
# Sanity check:
df_flagged |>
dplyr::filter(is.na(income_text_reviewed) & !is.na(income_merg) & income_orig == 10) |>
base::nrow()[1] 0
df_flagged |>
dplyr::filter(
is.na(income_text_reviewed) &
income_orig ==10 &
classification_income %in% c("OK", "Change to: [remove value]")) |>
base::nrow()[1] 0
df_flagged |>
dplyr::filter(
is.na(income_text_reviewed) &
income_orig == 10 &
is.na(classification_income)) |>
base::nrow()[1] 0
Exclusion Summary
# Combine direct exclusions and flags
df_exclusion <- df_merged |>
dplyr::left_join(
df_flagged |>
dplyr::select(ResponseId, exclusion_flags),
by = "ResponseId"
) |>
dplyr::mutate(
exclusion_criteria = base::factor(
dplyr::case_when(
valid_status %in% c("incomplete","not residents") ~ valid_status,
valid_status %in% c(
"implausible working with no income",
"implausible retired young",
"implausible extremes",
"implausible high scores with speed"
) ~ "implausible",
valid_status %in% c("too fast general","too fast low var") ~ "too fast",
valid_status == "passed" &
!is.na(exclusion_flags) & exclusion_flags == 1 ~ "flagged",
valid_status == "passed" &
(is.na(exclusion_flags) | exclusion_flags == 0) ~ "valid",
TRUE ~ NA_character_
),
levels = c(
"valid",
"incomplete",
"too fast",
"not residents",
"implausible",
"flagged"
)
)
)
# Country-level summary
summary_table <- df_exclusion |>
dplyr::group_by(country) |>
dplyr::summarise(
initial_number_of_participants = dplyr::n(),
valid_participants = base::sum(exclusion_criteria == "valid", na.rm = TRUE),
incomplete = base::sum(exclusion_criteria == "incomplete", na.rm = TRUE),
too_fast = base::sum(exclusion_criteria == "too fast", na.rm = TRUE),
not_residents = base::sum(exclusion_criteria == "not residents", na.rm = TRUE),
implausible = base::sum(exclusion_criteria == "implausible", na.rm = TRUE),
flagged = base::sum(exclusion_criteria == "flagged", na.rm = TRUE)
) |>
dplyr::mutate(
total_exclusions =
initial_number_of_participants - valid_participants,
total_pct_lost =
(total_exclusions / initial_number_of_participants) * 100
)
# Total row
total_row <- summary_table |>
dplyr::summarise(
country = "Total",
initial_number_of_participants =
base::sum(initial_number_of_participants),
valid_participants = base::sum(valid_participants),
incomplete = base::sum(incomplete),
too_fast = base::sum(too_fast),
not_residents = base::sum(not_residents),
implausible = base::sum(implausible),
flagged = base::sum(flagged),
total_exclusions = base::sum(total_exclusions),
total_pct_lost =
(total_exclusions / initial_number_of_participants) * 100
)
summary_table_pct <- dplyr::bind_rows(summary_table, total_row) |>
dplyr::mutate(
incomplete = paste0(
incomplete,
" (",
format(round((incomplete / initial_number_of_participants) * 100, 2), nsmall = 2),
"%)"
),
too_fast = paste0(
too_fast,
" (",
format(round((too_fast / initial_number_of_participants) * 100, 2), nsmall = 2),
"%)"
),
not_residents = paste0(
not_residents,
" (",
format(round((not_residents / initial_number_of_participants) * 100, 2), nsmall = 2),
"%)"
),
implausible = paste0(
implausible,
" (",
format(round((implausible / initial_number_of_participants) * 100, 2), nsmall = 2),
"%)"
),
flagged = paste0(
flagged,
" (",
format(round((flagged / initial_number_of_participants) * 100, 2), nsmall = 2),
"%)"
),
total_exclusions = paste0(
total_exclusions,
" (",
format(round(total_pct_lost, 2), nsmall = 1),
"%)"
)
) |>
dplyr::select(-total_pct_lost)
# gt
summary_table_pdf <- summary_table_pct |>
dplyr::rename(
Country = country,
"Initial number of participants" = initial_number_of_participants,
"Valid participants" = valid_participants,
"Incomplete" = incomplete,
"Too fast" = too_fast,
"Not residents" = not_residents,
"Implausible combinations" = implausible,
"Flagged" = flagged,
"Total exclusions" = total_exclusions
)
gt_table <- summary_table_pdf |>
gt::gt() |>
gt::cols_width(
Country ~ gt::px(65),
`Initial number of participants` ~ gt::px(70),
`Valid participants` ~ gt::px(70),
Incomplete ~ gt::px(70),
`Too fast` ~ gt::px(70),
`Not residents` ~ gt::px(70),
`Implausible combinations` ~ gt::px(70),
`Flagged` ~ gt::px(70),
`Total exclusions` ~ gt::px(105)
) |>
gt::tab_options(
table.font.size = 10,
column_labels.font.size = 11,
table.background.color = "white",
table.align = "center",
table.width = gt::px(650),
table.border.top.color = "white",
table.border.bottom.color = "white",
table.border.left.color = "white",
table.border.right.color = "white",
table_body.hlines.color = "black",
table_body.vlines.color = "white",
column_labels.vlines.color = "white",
column_labels.border.top.color = "white",
column_labels.border.bottom.color = "black"
) |>
gt::opt_table_lines() |>
gt::tab_style(
style = list(
gt::cell_text(
weight = "bold",
align = "center"
)
),
locations = gt::cells_column_labels(gt::everything())
) |>
gt::tab_style(
style = gt::cell_text(
align = "left"
),
locations = gt::cells_body(columns = Country)
) |>
gt::tab_style(
style = gt::cell_text(
align = "center"
),
locations = gt::cells_body(
columns = c(
`Initial number of participants`,
`Valid participants`,
Incomplete,
`Too fast`,
`Not residents`,
`Implausible combinations`,
`Flagged`,
`Total exclusions`
)
)
); gt_table| Country | Initial number of participants | Valid participants | Incomplete | Too fast | Not residents | Implausible combinations | Flagged | Total exclusions |
|---|---|---|---|---|---|---|---|---|
| Albania | 2284 | 1758 | 487 (21.32%) | 4 ( 0.18%) | 34 (1.49%) | 1 (0.04%) | 0 (0.00%) | 526 (23.03%) |
| Algeria | 203 | 149 | 53 (26.11%) | 1 ( 0.49%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 54 (26.60%) |
| Angola | 329 | 240 | 68 (20.67%) | 15 ( 4.56%) | 1 (0.30%) | 5 (1.52%) | 0 (0.00%) | 89 (27.05%) |
| Argentina | 769 | 634 | 119 (15.47%) | 3 ( 0.39%) | 11 (1.43%) | 2 (0.26%) | 0 (0.00%) | 135 (17.56%) |
| Armenia | 334 | 246 | 83 (24.85%) | 1 ( 0.30%) | 1 (0.30%) | 3 (0.90%) | 0 (0.00%) | 88 (26.35%) |
| Australia | 605 | 500 | 67 (11.07%) | 25 ( 4.13%) | 7 (1.16%) | 5 (0.83%) | 1 (0.17%) | 105 (17.36%) |
| Austria | 685 | 570 | 106 (15.47%) | 8 ( 1.17%) | 1 (0.15%) | 0 (0.00%) | 0 (0.00%) | 115 (16.79%) |
| Bahrain | 211 | 161 | 48 (22.75%) | 1 ( 0.47%) | 1 (0.47%) | 0 (0.00%) | 0 (0.00%) | 50 (23.70%) |
| Bangladesh | 536 | 335 | 186 (34.70%) | 0 ( 0.00%) | 0 (0.00%) | 14 (2.61%) | 1 (0.19%) | 201 (37.50%) |
| Belgium | 331 | 272 | 42 (12.69%) | 14 ( 4.23%) | 2 (0.60%) | 1 (0.30%) | 0 (0.00%) | 59 (17.82%) |
| Bolivia | 341 | 279 | 59 (17.30%) | 0 ( 0.00%) | 2 (0.59%) | 1 (0.29%) | 0 (0.00%) | 62 (18.18%) |
| Bosnia-Herzegovina | 642 | 486 | 144 (22.43%) | 11 ( 1.71%) | 1 (0.16%) | 0 (0.00%) | 0 (0.00%) | 156 (24.30%) |
| Brazil | 2094 | 1809 | 241 (11.51%) | 30 ( 1.43%) | 1 (0.05%) | 13 (0.62%) | 0 (0.00%) | 285 (13.61%) |
| Bulgaria | 393 | 324 | 67 (17.05%) | 1 ( 0.25%) | 0 (0.00%) | 1 (0.25%) | 0 (0.00%) | 69 (17.56%) |
| Canada | 874 | 707 | 148 (16.93%) | 16 ( 1.83%) | 1 (0.11%) | 2 (0.23%) | 0 (0.00%) | 167 (19.11%) |
| Chad | 192 | 115 | 73 (38.02%) | 0 ( 0.00%) | 2 (1.04%) | 2 (1.04%) | 0 (0.00%) | 77 (40.10%) |
| Chile | 240 | 207 | 30 (12.50%) | 3 ( 1.25%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 33 (13.75%) |
| China | 2523 | 1018 | 215 ( 8.52%) | 1277 (50.61%) | 0 (0.00%) | 13 (0.52%) | 0 (0.00%) | 1505 (59.65%) |
| Croatia | 455 | 349 | 99 (21.76%) | 7 ( 1.54%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 106 (23.30%) |
| Cyprus | 218 | 173 | 41 (18.81%) | 0 ( 0.00%) | 0 (0.00%) | 4 (1.83%) | 0 (0.00%) | 45 (20.64%) |
| Czech Republic | 267 | 202 | 63 (23.60%) | 1 ( 0.37%) | 1 (0.37%) | 0 (0.00%) | 0 (0.00%) | 65 (24.34%) |
| Denmark | 338 | 283 | 47 (13.91%) | 7 ( 2.07%) | 0 (0.00%) | 1 (0.30%) | 0 (0.00%) | 55 (16.27%) |
| Ecuador | 1075 | 954 | 94 ( 8.74%) | 7 ( 0.65%) | 11 (1.02%) | 9 (0.84%) | 0 (0.00%) | 121 (11.26%) |
| Egypt | 869 | 630 | 221 (25.43%) | 5 ( 0.58%) | 9 (1.04%) | 4 (0.46%) | 0 (0.00%) | 239 (27.50%) |
| Estonia | 2402 | 1903 | 480 (19.98%) | 4 ( 0.17%) | 3 (0.12%) | 12 (0.50%) | 0 (0.00%) | 499 (20.77%) |
| Ethiopia | 552 | 403 | 141 (25.54%) | 1 ( 0.18%) | 4 (0.72%) | 3 (0.54%) | 0 (0.00%) | 149 (26.99%) |
| Finland | 275 | 241 | 24 ( 8.73%) | 7 ( 2.55%) | 1 (0.36%) | 2 (0.73%) | 0 (0.00%) | 34 (12.36%) |
| France | 1175 | 908 | 153 (13.02%) | 90 ( 7.66%) | 2 (0.17%) | 21 (1.79%) | 1 (0.09%) | 267 (22.72%) |
| Georgia | 504 | 371 | 126 (25.00%) | 6 ( 1.19%) | 1 (0.20%) | 0 (0.00%) | 0 (0.00%) | 133 (26.39%) |
| Germany | 1008 | 824 | 156 (15.48%) | 21 ( 2.08%) | 1 (0.10%) | 5 (0.50%) | 1 (0.10%) | 184 (18.25%) |
| Greece | 532 | 444 | 81 (15.23%) | 3 ( 0.56%) | 2 (0.38%) | 2 (0.38%) | 0 (0.00%) | 88 (16.54%) |
| Hong Kong | 237 | 176 | 45 (18.99%) | 6 ( 2.53%) | 10 (4.22%) | 0 (0.00%) | 0 (0.00%) | 61 (25.74%) |
| Hungary | 735 | 555 | 169 (22.99%) | 5 ( 0.68%) | 3 (0.41%) | 3 (0.41%) | 0 (0.00%) | 180 (24.49%) |
| India | 1627 | 1225 | 315 (19.36%) | 26 ( 1.60%) | 21 (1.29%) | 34 (2.09%) | 6 (0.37%) | 402 (24.71%) |
| Indonesia | 1501 | 1223 | 250 (16.66%) | 13 ( 0.87%) | 3 (0.20%) | 8 (0.53%) | 4 (0.27%) | 278 (18.52%) |
| Iran | 292 | 216 | 75 (25.68%) | 0 ( 0.00%) | 0 (0.00%) | 1 (0.34%) | 0 (0.00%) | 76 (26.03%) |
| Ireland | 1661 | 1526 | 110 ( 6.62%) | 10 ( 0.60%) | 12 (0.72%) | 3 (0.18%) | 0 (0.00%) | 135 ( 8.13%) |
| Israel | 437 | 353 | 75 (17.16%) | 8 ( 1.83%) | 0 (0.00%) | 1 (0.23%) | 0 (0.00%) | 84 (19.22%) |
| Italy | 566 | 489 | 66 (11.66%) | 7 ( 1.24%) | 1 (0.18%) | 3 (0.53%) | 0 (0.00%) | 77 (13.60%) |
| Japan | 549 | 431 | 76 (13.84%) | 36 ( 6.56%) | 0 (0.00%) | 5 (0.91%) | 1 (0.18%) | 118 (21.49%) |
| Kazakhstan | 787 | 676 | 91 (11.56%) | 11 ( 1.40%) | 8 (1.02%) | 1 (0.13%) | 0 (0.00%) | 111 (14.10%) |
| Kosovo | 1373 | 994 | 359 (26.15%) | 4 ( 0.29%) | 12 (0.87%) | 4 (0.29%) | 0 (0.00%) | 379 (27.60%) |
| Kuwait | 315 | 241 | 69 (21.90%) | 2 ( 0.63%) | 2 (0.63%) | 1 (0.32%) | 0 (0.00%) | 74 (23.49%) |
| Kyrgyzstan | 375 | 274 | 74 (19.73%) | 22 ( 5.87%) | 1 (0.27%) | 4 (1.07%) | 0 (0.00%) | 101 (26.93%) |
| Latvia | 1023 | 806 | 206 (20.14%) | 6 ( 0.59%) | 2 (0.20%) | 3 (0.29%) | 0 (0.00%) | 217 (21.21%) |
| Lebanon | 416 | 322 | 84 (20.19%) | 0 ( 0.00%) | 3 (0.72%) | 7 (1.68%) | 0 (0.00%) | 94 (22.60%) |
| Madagascar | 169 | 145 | 22 (13.02%) | 0 ( 0.00%) | 0 (0.00%) | 2 (1.18%) | 0 (0.00%) | 24 (14.20%) |
| Malaysia | 816 | 706 | 99 (12.13%) | 2 ( 0.25%) | 1 (0.12%) | 7 (0.86%) | 1 (0.12%) | 110 (13.48%) |
| Mexico | 1164 | 1062 | 84 ( 7.22%) | 10 ( 0.86%) | 1 (0.09%) | 6 (0.52%) | 1 (0.09%) | 102 ( 8.76%) |
| Moldova | 511 | 398 | 100 (19.57%) | 3 ( 0.59%) | 4 (0.78%) | 5 (0.98%) | 1 (0.20%) | 113 (22.11%) |
| Mongolia | 367 | 261 | 100 (27.25%) | 0 ( 0.00%) | 6 (1.63%) | 0 (0.00%) | 0 (0.00%) | 106 (28.88%) |
| Montenegro | 358 | 301 | 45 (12.57%) | 4 ( 1.12%) | 7 (1.96%) | 1 (0.28%) | 0 (0.00%) | 57 (15.92%) |
| Morocco | 302 | 231 | 61 (20.20%) | 3 ( 0.99%) | 1 (0.33%) | 6 (1.99%) | 0 (0.00%) | 71 (23.51%) |
| Mozambique | 154 | 122 | 31 (20.13%) | 0 ( 0.00%) | 0 (0.00%) | 1 (0.65%) | 0 (0.00%) | 32 (20.78%) |
| Netherlands | 448 | 353 | 79 (17.63%) | 14 ( 3.12%) | 1 (0.22%) | 1 (0.22%) | 0 (0.00%) | 95 (21.21%) |
| Nigeria | 721 | 636 | 75 (10.40%) | 0 ( 0.00%) | 8 (1.11%) | 2 (0.28%) | 0 (0.00%) | 85 (11.79%) |
| North Macedonia | 268 | 230 | 37 (13.81%) | 0 ( 0.00%) | 1 (0.37%) | 0 (0.00%) | 0 (0.00%) | 38 (14.18%) |
| Norway | 509 | 408 | 90 (17.68%) | 11 ( 2.16%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 101 (19.84%) |
| Oman | 520 | 413 | 100 (19.23%) | 1 ( 0.19%) | 3 (0.58%) | 2 (0.38%) | 1 (0.19%) | 107 (20.58%) |
| Pakistan | 507 | 401 | 94 (18.54%) | 2 ( 0.39%) | 1 (0.20%) | 9 (1.78%) | 0 (0.00%) | 106 (20.91%) |
| Paraguay | 205 | 162 | 42 (20.49%) | 1 ( 0.49%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 43 (20.98%) |
| Peru | 1031 | 868 | 87 ( 8.44%) | 48 ( 4.66%) | 4 (0.39%) | 23 (2.23%) | 1 (0.10%) | 163 (15.81%) |
| Philippines | 3556 | 2636 | 769 (21.63%) | 17 ( 0.48%) | 120 (3.37%) | 12 (0.34%) | 2 (0.06%) | 920 (25.87%) |
| Poland | 1288 | 1024 | 250 (19.41%) | 8 ( 0.62%) | 3 (0.23%) | 3 (0.23%) | 0 (0.00%) | 264 (20.50%) |
| Portugal | 579 | 451 | 118 (20.38%) | 7 ( 1.21%) | 0 (0.00%) | 3 (0.52%) | 0 (0.00%) | 128 (22.11%) |
| Qatar | 526 | 397 | 113 (21.48%) | 5 ( 0.95%) | 11 (2.09%) | 0 (0.00%) | 0 (0.00%) | 129 (24.52%) |
| Republic of Korea | 492 | 425 | 27 ( 5.49%) | 34 ( 6.91%) | 1 (0.20%) | 5 (1.02%) | 0 (0.00%) | 67 (13.62%) |
| Romania | 861 | 676 | 174 (20.21%) | 10 ( 1.16%) | 0 (0.00%) | 1 (0.12%) | 0 (0.00%) | 185 (21.49%) |
| Russia | 1322 | 1168 | 73 ( 5.52%) | 51 ( 3.86%) | 17 (1.29%) | 12 (0.91%) | 1 (0.08%) | 154 (11.65%) |
| Saudi Arabia | 296 | 260 | 27 ( 9.12%) | 6 ( 2.03%) | 1 (0.34%) | 2 (0.68%) | 0 (0.00%) | 36 (12.16%) |
| Senegal | 211 | 142 | 66 (31.28%) | 0 ( 0.00%) | 1 (0.47%) | 2 (0.95%) | 0 (0.00%) | 69 (32.70%) |
| Serbia | 420 | 324 | 89 (21.19%) | 4 ( 0.95%) | 3 (0.71%) | 0 (0.00%) | 0 (0.00%) | 96 (22.86%) |
| Singapore | 298 | 239 | 19 ( 6.38%) | 39 (13.09%) | 0 (0.00%) | 1 (0.34%) | 0 (0.00%) | 59 (19.80%) |
| Slovakia | 724 | 517 | 196 (27.07%) | 3 ( 0.41%) | 5 (0.69%) | 3 (0.41%) | 0 (0.00%) | 207 (28.59%) |
| Slovenia | 746 | 584 | 154 (20.64%) | 5 ( 0.67%) | 1 (0.13%) | 2 (0.27%) | 0 (0.00%) | 162 (21.72%) |
| South Africa | 279 | 233 | 45 (16.13%) | 0 ( 0.00%) | 0 (0.00%) | 1 (0.36%) | 0 (0.00%) | 46 (16.49%) |
| Spain | 729 | 614 | 104 (14.27%) | 6 ( 0.82%) | 3 (0.41%) | 2 (0.27%) | 0 (0.00%) | 115 (15.78%) |
| Sweden | 1149 | 824 | 266 (23.15%) | 53 ( 4.61%) | 0 (0.00%) | 6 (0.52%) | 0 (0.00%) | 325 (28.29%) |
| Switzerland | 823 | 668 | 139 (16.89%) | 14 ( 1.70%) | 0 (0.00%) | 2 (0.24%) | 0 (0.00%) | 155 (18.83%) |
| Taiwan | 201 | 146 | 36 (17.91%) | 17 ( 8.46%) | 0 (0.00%) | 2 (1.00%) | 0 (0.00%) | 55 (27.36%) |
| Thailand | 440 | 375 | 59 (13.41%) | 4 ( 0.91%) | 0 (0.00%) | 2 (0.45%) | 0 (0.00%) | 65 (14.77%) |
| Timor-Leste | 277 | 144 | 129 (46.57%) | 0 ( 0.00%) | 1 (0.36%) | 3 (1.08%) | 0 (0.00%) | 133 (48.01%) |
| Türkiye | 682 | 487 | 171 (25.07%) | 11 ( 1.61%) | 12 (1.76%) | 1 (0.15%) | 0 (0.00%) | 195 (28.59%) |
| UAE | 336 | 228 | 86 (25.60%) | 6 ( 1.79%) | 13 (3.87%) | 3 (0.89%) | 0 (0.00%) | 108 (32.14%) |
| UK | 852 | 671 | 147 (17.25%) | 26 ( 3.05%) | 5 (0.59%) | 3 (0.35%) | 0 (0.00%) | 181 (21.24%) |
| USA | 5708 | 4242 | 1002 (17.55%) | 170 ( 2.98%) | 280 (4.91%) | 11 (0.19%) | 3 (0.05%) | 1466 (25.68%) |
| Uganda | 332 | 242 | 85 (25.60%) | 0 ( 0.00%) | 1 (0.30%) | 4 (1.20%) | 0 (0.00%) | 90 (27.11%) |
| Ukraine | 749 | 654 | 88 (11.75%) | 3 ( 0.40%) | 3 (0.40%) | 1 (0.13%) | 0 (0.00%) | 95 (12.68%) |
| Uruguay | 815 | 566 | 246 (30.18%) | 1 ( 0.12%) | 2 (0.25%) | 0 (0.00%) | 0 (0.00%) | 249 (30.55%) |
| Uzbekistan | 662 | 556 | 102 (15.41%) | 4 ( 0.60%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 106 (16.01%) |
| Yemen | 580 | 397 | 170 (29.31%) | 1 ( 0.17%) | 5 (0.86%) | 7 (1.21%) | 0 (0.00%) | 183 (31.55%) |
| Zimbabwe | 275 | 210 | 59 (21.45%) | 3 ( 1.09%) | 1 (0.36%) | 2 (0.73%) | 0 (0.00%) | 65 (23.64%) |
| Total | 69408 | 53799 | 12181 (17.55%) | 2332 ( 3.36%) | 705 (1.02%) | 365 (0.53%) | 26 (0.04%) | 15609 (22.49%) |
gt::gtsave(gt_table, "222_exclusion_table.html")
gt::gtsave(gt_table, "222_exclusion_table.docx")
pagedown::chrome_print(
"222_exclusion_table.html",
output = "222_exclusion_table.pdf"
)
# Country-level inclusion and sample sizes
country_inclusion <- df_exclusion |>
dplyr::group_by(country) |>
dplyr::summarise(
initial_number_of_participants = dplyr::n(),
valid_participants = sum(exclusion_criteria == "valid", na.rm = TRUE),
inclusion_rate = format(round(100 * valid_participants /
initial_number_of_participants, 2), nsmall = 2)
)
# Country with minimum and maximum inclusion rates
country_inclusion |>
dplyr::slice_min(inclusion_rate, n = 1, with_ties = FALSE)# A tibble: 1 × 4
country initial_number_of_participants valid_participants inclusion_rate
<chr> <int> <int> <chr>
1 China 2523 1018 40.35
# A tibble: 1 × 4
country initial_number_of_participants valid_participants inclusion_rate
<chr> <int> <int> <chr>
1 Ireland 1661 1526 91.87
# Countries with smallest and largest valid sample sizes
country_inclusion |>
dplyr::slice_min(valid_participants, n = 1, with_ties = FALSE)# A tibble: 1 × 4
country initial_number_of_participants valid_participants inclusion_rate
<chr> <int> <int> <chr>
1 Chad 192 115 59.90
# A tibble: 1 × 4
country initial_number_of_participants valid_participants inclusion_rate
<chr> <int> <int> <chr>
1 USA 5708 4242 74.32
Missing Data
# Focus on the original variables
orig_cols <- base::intersect(names(df_final), names(df_pub))
df_final_orig <- df_final |>
dplyr::select(all_of(orig_cols))
visdat::vis_miss(df_final_orig, cluster = TRUE, warn_large_data = FALSE)df_final_orig |>
dplyr::summarise(
dplyr::across(
dplyr::everything(),
\(x) 100 * mean(is.na(x), na.rm = TRUE)
),
.groups = "drop"
) |>
tidyr::pivot_longer(
cols = dplyr::everything(),
names_to = "col",
values_to = "pct_missing"
) |>
print_reactable(sorted_col = "pct_missing", width = 600)A0.3. Harmonize financial variables
Midpoint of the brackets
The midpoints of the brackets were computed as (low_bracket + high_bracket)/2 except the last bracket needs to be computed differently because it is open-ended. We computed the median ratio between the midpoints of the last bracket and the low point of the last bracket across countries where the low point of the last bracket was lower than the maximum open text answer provided by participants. This median ratio was then used to compute the midpoint of the last bracket in countries where the low point of the last bracket was higher than the maximum open text answer provided by participants.
# Calculate midpoints for all brackets except the last one
income_brackets <- income_recoded |>
dplyr::mutate(
income_midpoint = dplyr::case_when(
income_orig == 9 ~ NA_real_,
TRUE ~ (income_lowpoint + income_highpoint) / 2
)
) |>
dplyr::select(UserLanguage, income_orig, income_midpoint) |>
dplyr::glimpse(width = 100)Rows: 1,125
Columns: 3
$ UserLanguage <chr> "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB…
$ income_orig <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7,…
$ income_midpoint <dbl> 60000.0, 160000.0, 250000.0, 355000.0, 515000.0, 725000.0, 1040000.0, 1625…
[1] 53799
df_final <- df_final |>
dplyr::left_join(income_brackets, by = c("UserLanguage", "income_orig"))
nrow(df_final)[1] 53799
# Calculate midpoints for the last bracket
midpoints_last <- df_final |>
# There are no income values from the open text field in Taiwan
dplyr::filter(country != "Taiwan") |>
dplyr::group_by(country) |>
dplyr::summarise(
# Ireland have different lowpoints for the last bracket
# in the main dataset and the sponsored dataset.
max_income_lowpoint = base::max(income_lowpoint_9, na.rm = TRUE),
max_income_text = base::max(income_text_reviewed, na.rm = TRUE),
income_midpoint_last =
base::mean(c(max_income_lowpoint, max_income_text), na.rm = TRUE),
bracket_higher_than_text =
!is.na(max_income_text) & max_income_lowpoint > max_income_text,
ratio = income_midpoint_last/max_income_lowpoint
) |>
dplyr::glimpse(width = 100)Rows: 91
Columns: 6
$ country <chr> "Albania", "Algeria", "Angola", "Argentina", "Armenia", "Australi…
$ max_income_lowpoint <dbl> 2000000, 200000, 1450000001, 2400000, 1200001, 250000, 93163, 230…
$ max_income_text <dbl> 2000000, 400000, 200000000, 40000000, 4000000, 290000, 200000, 55…
$ income_midpoint_last <dbl> 2000000.0, 300000.0, 825000000.5, 21200000.0, 2600000.5, 270000.0…
$ bracket_higher_than_text <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
$ ratio <dbl> 1.0000000, 1.5000000, 0.5689655, 8.8333333, 2.1666653, 1.0800000,…
ratio_last_bracket <- midpoints_last |>
dplyr::filter(!bracket_higher_than_text) |>
dplyr::summarise(median_ratio = stats::median(ratio, na.rm = TRUE)) |>
dplyr::pull(median_ratio); ratio_last_bracket[1] 2
[1] 53799
df_final <- df_final |>
dplyr::left_join(midpoints_last |> dplyr::select(country, bracket_higher_than_text, max_income_lowpoint, income_midpoint_last), by = "country")
nrow(df_final)[1] 53799
# Update income_midpoint for the last bracket
df_final <-
df_final |>
dplyr::mutate(
income_midpoint = dplyr::case_when(
income_orig == 9 & !bracket_higher_than_text ~ income_midpoint_last,
income_orig == 9 & bracket_higher_than_text ~ max_income_lowpoint * ratio_last_bracket,
income_orig == 9 & country == "Taiwan"
~ income_lowpoint_9 * ratio_last_bracket,
TRUE ~ income_midpoint
)
)
# Sanity check: View last midpoints
df_final |>
dplyr::filter(income_orig == 9) |>
dplyr::group_by(country, income_orig, income_lowpoint_9, income_midpoint) |>
dplyr::summarise() |>
dplyr::arrange(country) |>
print_reactable(sorted_col = "income_midpoint", width = 800)Convert monthly values to annual
We created a variable that combined midpoints from brackets with the open field answers and converted monthly values to annual values.
The survey versions for Bahrain and Pakistan requested for annual income in their native language, but monthly in the English version. If the distribution of responses is similar, we will retain the transformed monthly values.
annual monthly <NA>
28802 24997 0
annual monthly <NA>
12 0 21935 0
12.5 0 241 0
13 0 2821 0
<NA> 28802 0 0
df_final <- df_final |>
dplyr::mutate(
income_cont = dplyr::case_when(
income_orig == 0 ~ 0,
income_orig == 10 ~ income_text_reviewed,
!is.na(income_midpoint) ~ income_midpoint,
TRUE ~ NA_real_
),
income_cont_nozero = dplyr::case_when(
income_orig == 10 & income_text_reviewed > 0 ~ income_text_reviewed,
!is.na(income_midpoint) ~ income_midpoint,
TRUE ~ NA_real_
),
income_annual = dplyr::case_when(
income_period == "monthly" ~ income_cont_nozero * wages_per_year,
TRUE ~ income_cont_nozero
)
)
df_final |> dplyr::group_by(country, income_orig, income_merg, income_text_reviewed,
income_merg_translated, income_cont, income_midpoint,
income_cont_nozero, income_annual, income_period) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::arrange(country, income_orig) |>
print_reactable(sorted_col = "country", width = 900)Convert 2025 income values to 2024
The inflation2024_factor was calculated with Consumer Price Index (CPI): CPI 2025 / CPI 2024
If national reports only provided the percentage of inflation change, then the factor was calculated as 1 + (percentage inflation change / 100). Then, to convert 2025 income to 2024 values we divided 2025 income by this factor.
2024 2025 <NA>
50097 3702 0
0.966 1.001 1.0021 1.0182 1.0399 1.0415 1.045 1.046 1.073 1.087 1.118 <NA>
115 293 241 500 371 122 230 324 274 556 676 50097
Income net and gross
Collaborators from Kuwait, Oman, and Saudi Arabia confirmed that the income values do not require transformation because there is no income tax in these countries.
The survey version for Belgium requested net income in the Dutch version, and gross income in the French version.
Since the calculation of social contribution and tax deduction is not the same for all countries, we will remove the countries that asked for net income.
We will not apply tax brackets for Zimbabwe, since the system changes throughout the year due to inflation.
Converting all financial values to USD
df_final <- df_final |>
dplyr::mutate(
# We have already set values of 0 as NA earlier
income_annual_24_gross_USD = income_annual_24_gross *
one_local_unit_to_USD_conversion,
# If assets or debts are 0, set to NA
assets_USD = base::ifelse(
is.na(assets_reviewed) | assets_reviewed == 0,
NA_real_,
assets_reviewed * one_local_unit_to_USD_conversion
),
debts_USD = base::ifelse(
is.na(debts_reviewed) | debts_reviewed == 0,
NA_real_,
debts_reviewed * one_local_unit_to_USD_conversion
)
)Calculating z-scores and percentiles for assets and debts
# Sanity check: View means and sds
df_final |>
dplyr::group_by(country, income_type) |>
dplyr::summarise(
n_income = base::round(base::sum(!is.na(income_annual_24_gross_USD)), 2),
mean_income = base::round(base::mean(income_annual_24_gross_USD, na.rm = TRUE), 2),
sd_income = base::round(stats::sd(income_annual_24_gross_USD, na.rm = TRUE), 2),
n_assets = base::round(base::sum(!is.na(assets_USD)), 2),
mean_assets = base::round(base::mean(assets_USD, na.rm = TRUE), 2),
sd_assets = base::round(stats::sd(assets_USD, na.rm = TRUE), 2),
n_debts = base::round(base::sum(!is.na(debts_USD)), 2),
mean_debts = base::round(base::mean(debts_USD, na.rm = TRUE), 2),
sd_debts = base::round(stats::sd(debts_USD, na.rm = TRUE), 2)
) |>
print_reactable(sorted_col = "country", width = 900)df_final <- df_final |>
dplyr::group_by(country) |>
dplyr::mutate(
income_USD_z_local = base::ifelse(
!is.na(income_annual_24_gross_USD),
(income_annual_24_gross_USD -
base::mean(income_annual_24_gross_USD, na.rm = TRUE))
/ stats::sd(income_annual_24_gross_USD, na.rm = TRUE),
NA_real_
),
assets_USD_z_local = base::ifelse(
!is.na(assets_USD),
(assets_USD - base::mean(assets_USD, na.rm = TRUE))
/ stats::sd(assets_USD, na.rm = TRUE),
NA_real_
),
debts_USD_z_local = base::ifelse(
!is.na(debts_USD),
(debts_USD - base::mean(debts_USD, na.rm = TRUE))
/ stats::sd(debts_USD, na.rm = TRUE),
NA_real_
),
income_USD_percentile_local = base::ifelse(
!is.na(income_annual_24_gross_USD),
dplyr::percent_rank(income_annual_24_gross_USD),
NA_real_
),
assets_USD_percentile_local = base::ifelse(
!is.na(assets_USD),
dplyr::percent_rank(assets_USD),
NA_real_
),
debts_USD_percentile_local = base::ifelse(
!is.na(debts_USD),
dplyr::percent_rank(debts_USD),
NA_real_
)
) |>
dplyr::ungroup() |>
dplyr::mutate(
income_USD_z_full = base::ifelse(
!is.na(income_annual_24_gross_USD),
(income_annual_24_gross_USD -
base::mean(income_annual_24_gross_USD, na.rm = TRUE))
/ stats::sd(income_annual_24_gross_USD, na.rm = TRUE),
NA_real_
),
assets_USD_z_full = base::ifelse(
!is.na(assets_USD),
(assets_USD - base::mean(assets_USD, na.rm = TRUE))
/ stats::sd(assets_USD, na.rm = TRUE),
NA_real_
),
debts_USD_z_full = base::ifelse(
!is.na(debts_USD),
(debts_USD - base::mean(debts_USD, na.rm = TRUE))
/ stats::sd(debts_USD, na.rm = TRUE),
NA_real_
),
income_USD_percentile_full = base::ifelse(
!is.na(income_annual_24_gross_USD),
dplyr::percent_rank(income_annual_24_gross_USD),
NA_real_
),
assets_USD_percentile_full = base::ifelse(
!is.na(assets_USD),
dplyr::percent_rank(assets_USD),
NA_real_
),
debts_USD_percentile_full = base::ifelse(
!is.na(debts_USD),
dplyr::percent_rank(debts_USD),
NA_real_
)
)
(df_gmh |> filter(is.na(income_cont)) |> nrow()) +
(df_gmh |> filter(income_cont == 0) |> nrow())[1] 2664
[1] 2664
[1] 2664
(df_gmh |> filter(is.na(income_annual_24) & income_type != "net") |> nrow()) +
(df_gmh |> filter(income_type == "net") |> nrow())[1] 13180
[1] 13180
[1] 13180
A0.4. Weights data
For Moldova, Romania, Nigeria, Montenegro, Angola, Morocco, Uruguay, Paraguay, Greece, Iran, Hungary, Kosovo, Yemen, Chile, and Uganda, values of 1 were used instead of weighted scores.
# Load weights computed based on age, education, sex, and country
weights <- base::readRDS("444_weighted_data.RDS")
# Sanity check: View participants without weights due to missing sociodemographics
weights |>
dplyr::filter(is.na(ps_weight), !is.na(education_recoded_cat)) |>
dplyr::select(ResponseId, country, age, sex_binary_cat, education_recoded_cat) |>
print_reactable(sorted_col = "country", width = 600)[1] 53799
df_gmh <- df_final |>
dplyr::left_join( weights |> dplyr::select(ResponseId, ps_weight), by = "ResponseId")
nrow(df_gmh)[1] 53799
# For a set of countries, recode the weight score to 1. Also recode NA to 1.
df_gmh <- df_gmh %>%
mutate(ps_weight = base::ifelse(
country %in% flagged_countries,
1,
ps_weight),
ps_weight_flag = base::ifelse(
country %in% flagged_countries,
1, 0
),
ps_weight_na = base::ifelse(
country %in% flagged_countries,
NA_real_, ps_weight
),
ps_weight = base::ifelse(is.na(ps_weight), 1, ps_weight)
)
# Sanity check: How many missing values in weights after transforming those to 1?
df_gmh |>
dplyr::summarise(
n_missing_weights = base::sum(is.na(ps_weight_na)),
perc_missing_weights = (n_missing_weights / dplyr::n()) * 100
) |> base::nrow()[1] 1
A0.5. Calculate Factor Scores
Global Factor Scores
fit_mpwb <- lavaan::cfa(
'mpwb =~ mpwb_competence + mpwb_emotional_stability + mpwb_engagement + mpwb_meaning + mpwb_optimism + mpwb_positive_emotion + mpwb_positive_relationships + mpwb_resilience + mpwb_self_esteem + mpwb_vitality',
data = df_gmh,
std.lv = TRUE,
estimator = "MLR",
sampling.weights = "ps_weight"
)
# We don't have any missing case on any of these variables
# and lavPredict keeps the same row order according to their manual
factor_scores <- lavaan::lavPredict(fit_mpwb, type = "lv")
df_gmh$mpwb_factor_global <- factor_scores[,1]
# View loadings
summary(fit_mpwb, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)lavaan 0.6-19 ended normally after 15 iterations
Estimator ML
Optimization method NLMINB
Number of model parameters 20
Number of observations 53799
Sampling weights variable ps_weight
Model Test User Model:
Standard Scaled
Test Statistic 11658.139 4049.674
Degrees of freedom 35 35
P-value (Chi-square) 0.000 0.000
Scaling correction factor 2.879
Yuan-Bentler correction (Mplus variant)
Model Test Baseline Model:
Test statistic 281635.351 94343.306
Degrees of freedom 45 45
P-value 0.000 0.000
Scaling correction factor 2.985
User Model versus Baseline Model:
Comparative Fit Index (CFI) 0.959 0.957
Tucker-Lewis Index (TLI) 0.947 0.945
Robust Comparative Fit Index (CFI) 0.959
Robust Tucker-Lewis Index (TLI) 0.947
Loglikelihood and Information Criteria:
Loglikelihood user model (H0) -840221.377 -840221.377
Scaling correction factor 2.624
for the MLR correction
Loglikelihood unrestricted model (H1) -834392.308 -834392.308
Scaling correction factor 2.786
for the MLR correction
Akaike (AIC) 1680482.755 1680482.755
Bayesian (BIC) 1680660.615 1680660.615
Sample-size adjusted Bayesian (SABIC) 1680597.055 1680597.055
Root Mean Square Error of Approximation:
RMSEA 0.079 0.046
90 Percent confidence interval - lower 0.077 0.045
90 Percent confidence interval - upper 0.080 0.047
P-value H_0: RMSEA <= 0.050 0.000 1.000
P-value H_0: RMSEA >= 0.080 0.025 0.000
Robust RMSEA 0.078
90 Percent confidence interval - lower 0.076
90 Percent confidence interval - upper 0.080
P-value H_0: Robust RMSEA <= 0.050 0.000
P-value H_0: Robust RMSEA >= 0.080 0.092
Standardized Root Mean Square Residual:
SRMR 0.031 0.031
Parameter Estimates:
Standard errors Sandwich
Information bread Observed
Observed information based on Hessian
Latent Variables:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
mpwb =~
mpwb_competenc 1.064 0.009 124.883 0.000 1.064 0.737
mpwb_mtnl_stbl 1.123 0.008 139.341 0.000 1.123 0.743
mpwb_engagemnt 0.773 0.009 85.772 0.000 0.773 0.577
mpwb_meaning 1.127 0.009 130.748 0.000 1.127 0.757
mpwb_optimism 1.213 0.008 145.537 0.000 1.213 0.763
mpwb_postv_mtn 1.202 0.008 155.756 0.000 1.202 0.821
mpwb_pstv_rltn 0.743 0.010 76.452 0.000 0.743 0.508
mpwb_resilienc 0.970 0.008 114.688 0.000 0.970 0.654
mpwb_self_estm 1.196 0.008 151.048 0.000 1.196 0.800
mpwb_vitality 1.189 0.008 153.797 0.000 1.189 0.762
Variances:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
.mpwb_competenc 0.951 0.012 80.689 0.000 0.951 0.457
.mpwb_mtnl_stbl 1.023 0.012 84.746 0.000 1.023 0.448
.mpwb_engagemnt 1.194 0.013 90.907 0.000 1.194 0.667
.mpwb_meaning 0.949 0.012 76.064 0.000 0.949 0.428
.mpwb_optimism 1.059 0.013 79.653 0.000 1.059 0.418
.mpwb_postv_mtn 0.698 0.010 67.005 0.000 0.698 0.326
.mpwb_pstv_rltn 1.583 0.017 95.070 0.000 1.583 0.742
.mpwb_resilienc 1.262 0.013 99.437 0.000 1.262 0.573
.mpwb_self_estm 0.804 0.011 71.741 0.000 0.804 0.360
.mpwb_vitality 1.024 0.012 83.341 0.000 1.024 0.420
mpwb 1.000 1.000 1.000
R-Square:
Estimate
mpwb_competenc 0.543
mpwb_mtnl_stbl 0.552
mpwb_engagemnt 0.333
mpwb_meaning 0.572
mpwb_optimism 0.582
mpwb_postv_mtn 0.674
mpwb_pstv_rltn 0.258
mpwb_resilienc 0.427
mpwb_self_estm 0.640
mpwb_vitality 0.580
[1] 0.995
[1] 0.995
r t p
1 0.995 22094.64 <.001
r t p
1 0.995 21926.65 <.001
# Sanity check: How many missing values in global factor scores?
df_gmh |>
dplyr::filter(is.na(mpwb_factor_global)) |>
base::nrow()[1] 0
Within Country Factor Scores
# Split data by country
country_list <- base::split(df_gmh, df_gmh$country)
# For each country we will fit CFA and extract scores
country_scores <- lapply(country_list, function(country_data) {
fit <- lavaan::cfa(
'mpwb =~ mpwb_competence + mpwb_emotional_stability + mpwb_engagement + mpwb_meaning + mpwb_optimism + mpwb_positive_emotion + mpwb_positive_relationships + mpwb_resilience + mpwb_self_esteem + mpwb_vitality',
data = country_data,
std.lv = TRUE,
estimator = "MLR",
sampling.weights = "ps_weight"
)
factor_scores <- lavaan::lavPredict(fit, type = "lv")[, 1]
country_data$mpwb_factor_within <- factor_scores
return(country_data)
})
# Recombine all countries
df_gmh <- dplyr::bind_rows(country_scores)
# Sanity check
df_gmh |>
dplyr::group_by(country) |>
dplyr::group_modify(~ {
tibble::tibble(
n = base::nrow(.x),
mean_factor_within = base::round(base::mean(.x$mpwb_factor_within, na.rm = TRUE), 2),
sd_factor_within = base::round(stats::sd(.x$mpwb_factor_within, na.rm = TRUE), 2),
cor_mpwb_sum = base::round(
stats::cor(.x$mpwb_factor_within, .x$mpwb_sum, use = "complete.obs"),
3
),
cor_mpwb_mean = base::round(
stats::cor(.x$mpwb_factor_within, .x$mpwb_mean, use = "complete.obs"),
3
),
cor_mpwb_sum_wt = base::round(
weighted_corr(.x, mpwb_factor_within, mpwb_sum)[[1]],
3
),
cor_mpwb_mean_wt = base::round(
weighted_corr(.x, mpwb_factor_within, mpwb_mean)[[1]],
3
)
)
}) |>
dplyr::ungroup() |>
print_reactable(sorted_col = "country", width = 500)Error in base::round(weighted_corr(.x, mpwb_factor_within, mpwb_sum)[[1]], : non-numeric argument to mathematical function
# Sanity check: How many missing values in factor scores?
df_gmh |>
dplyr::filter(is.na(mpwb_factor_within)) |>
base::nrow()[1] 0
A0.6 Saving data
# Write labels from codebook to df_gmh
codebook_label <- codebook |>
dplyr::select(variable, label) |>
(\(x) { stats::setNames(x$label, x$variable) })()
for (v in names(codebook_label)) {
labelled::var_label(df_gmh[[v]]) <- codebook_label[[v]]
}
# Save cleaned data
saveRDS(df_gmh, "999_cleaned_data.rds")
write.csv(df_gmh, "999_cleaned_data.csv", row.names = FALSE)
rm(v, codebook_label)A1. Findings’ Timeline
[1] "POSIXct" "POSIXt"
[1] "America/New_York"
Show the code
# Only consider main dataset
df_time <- df_gmh |>
dplyr::mutate(
StartDate = lubridate::as_date(StartDate)
) |>
dplyr::filter(!is.na(StartDate))
# Daily aggregates of mpwb_sum
daily_sum <- df_time |>
dplyr::group_by(StartDate) |>
dplyr::summarise(
n = dplyr::n(),
sum_x = base::sum(mpwb_sum),
sum_x2 = base::sum(mpwb_sum^2)
)
cum_sum <- daily_sum |>
dplyr::arrange(StartDate) |>
dplyr::mutate(
cum_n = cumsum(n),
cum_sum = cumsum(sum_x),
cum_sumsq = cumsum(sum_x2),
mean = cum_sum / cum_n,
var = dplyr::if_else(
cum_n > 1,
(cum_sumsq - (cum_sum^2) / cum_n) / (cum_n - 1),
NA_real_
),
se = sqrt(var / cum_n),
lo = mean - 1.96 * se,
hi = mean + 1.96 * se
)
p_daily_n <-
ggplot2::ggplot(daily_sum, ggplot2::aes(x = StartDate, y = n)) +
ggplot2::geom_col(width = 1, fill = "#11357f") +
ggplot2::labs(x = NULL, y = "Daily n\n") +
ggplot2::theme(
panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
)
p_cum_mean <-
ggplot2::ggplot(cum_sum, ggplot2::aes(x = StartDate, y = mean)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.7, fill = "#abc7ff") +
ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
ggplot2::geom_point(size = 0.9, colour = "#11357f") +
ggplot2::labs(x = "Date", y = "MPWB Sum (Rolling mean)\n") +
ggplot2::theme(
panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
)
# Daily aggregates per measure
long_dim <- df_time |>
dplyr::select(StartDate, dplyr::all_of(mpwb_items)) |>
tidyr::pivot_longer(
cols = dplyr::all_of(mpwb_items),
names_to = "measure",
values_to = "value"
)
daily_dim <- long_dim |>
dplyr::group_by(StartDate, measure) |>
dplyr::summarise(
n = dplyr::n(),
sum_x = sum(value),
sum_x2 = sum(value^2)
)
cum_dim <- daily_dim |>
dplyr::group_by(measure) |>
dplyr::arrange(StartDate, .by_group = TRUE) |>
dplyr::mutate(
cum_n = base::cumsum(n),
cum_sum = base::cumsum(sum_x),
cum_sumsq = base::cumsum(sum_x2),
mean = cum_sum / cum_n,
var = dplyr::if_else(
cum_n > 1,
(cum_sumsq - (cum_sum^2) / cum_n) / (cum_n - 1),
NA_real_
),
se = base::sqrt(var / cum_n),
lo = mean - 1.96 * se,
hi = mean + 1.96 * se
) |>
dplyr::mutate(measure_lab = dplyr::recode(measure, !!!mpwb_labels))
p_dims <-
ggplot2::ggplot(cum_dim, ggplot2::aes(x = StartDate, y = mean)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.7, fill = "#abc7ff") +
ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
ggplot2::geom_point(size = 0.8, colour = "#11357f") +
ggplot2::facet_wrap(~ measure_lab, ncol = 2) +
ggplot2::labs(x = "Date", y = "Rolling mean") +
theme(
panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25),
)
# Overall
daily_overall <- df_time |>
tidyr::pivot_longer(
cols = mpwb_sum,
names_to = "measure",
values_to = "value"
) |>
dplyr::group_by(StartDate, measure) |>
dplyr::summarise(
n = dplyr::n(),
mean_val = base::mean(value, na.rm = TRUE),
sd_val = stats::sd(value, na.rm = TRUE),
se_val = if (n > 1) sd_val / base::sqrt(n) else NA_real_,
lo_val = mean_val - 1.96 * se_val,
hi_val = mean_val + 1.96 * se_val
)
range_dates <- base::range(daily_overall$StartDate, na.rm = TRUE)
breaks_daily <- base::seq(range_dates[1], range_dates[2], by = "1 day")
p_overall <-
ggplot2::ggplot(daily_overall, ggplot2::aes(x = StartDate, y = mean_val)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo_val, ymax = hi_val), alpha = 0.7, fill = "#abc7ff") +
ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
ggplot2::geom_point(size = 0.9, colour = "#11357f") +
ggplot2::scale_x_date(breaks = breaks_daily,
labels = daily_overall |>
dplyr::mutate(
day = base::format(StartDate, "%d") |> gsub("^0", "", x = _),
mon = base::format(StartDate, "%b"),
label = if (day == "1") paste(day, mon) else day
) |>
dplyr::pull(label),
expand = base::c(0.01, 0.01)
) +
ggplot2::labs( x = NULL, y = "MPWB sum") +
theme_gmh +
theme(axis.text.x = element_text(
margin = margin(t = 1), face = "bold", angle = 45, hjust = 1, vjust = 1, size = 7
),
panel.grid.major.x = ggplot2::element_line(
color = "#ddeded", linewidth = 0.25
)
)Show the code
# Daily aggregates of ls
daily_sum_ls <- df_time |>
dplyr::group_by(StartDate) |>
dplyr::summarise(
n = dplyr::n(),
sum_x = base::sum(life_satisfaction),
sum_x2 = base::sum(life_satisfaction^2)
)
cum_sum_ls <- daily_sum_ls |>
dplyr::arrange(StartDate) |>
dplyr::mutate(
cum_n = cumsum(n),
cum_sum = cumsum(sum_x),
cum_sumsq = cumsum(sum_x2),
mean = cum_sum / cum_n,
var = dplyr::if_else(
cum_n > 1,
(cum_sumsq - (cum_sum^2) / cum_n) / (cum_n - 1),
NA_real_
),
se = sqrt(var / cum_n),
lo = mean - 1.96 * se,
hi = mean + 1.96 * se
)
p_daily_n_ls <-
ggplot2::ggplot(daily_sum_ls, ggplot2::aes(x = StartDate, y = n)) +
ggplot2::geom_col(width = 1, fill = "#11357f") +
ggplot2::labs(x = NULL, y = "Daily n\n") +
ggplot2::theme(
panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
)
p_cum_mean_ls <-
ggplot2::ggplot(cum_sum_ls, ggplot2::aes(x = StartDate, y = mean)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.7, fill = "#abc7ff") +
ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
ggplot2::geom_point(size = 0.9, colour = "#11357f") +
ggplot2::labs(x = "Date", y = "Life satisfaction (Rolling mean)\n") +
ggplot2::theme(
panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
)
# Overall
daily_overall_ls <- df_time |>
tidyr::pivot_longer(
cols = life_satisfaction,
names_to = "measure",
values_to = "value"
) |>
dplyr::group_by(StartDate, measure) |>
dplyr::summarise(
n = dplyr::n(),
mean_val = base::mean(value, na.rm = TRUE),
sd_val = stats::sd(value, na.rm = TRUE),
se_val = if (n > 1) sd_val / base::sqrt(n) else NA_real_,
lo_val = mean_val - 1.96 * se_val,
hi_val = mean_val + 1.96 * se_val
)
range_dates <- base::range(daily_overall_ls$StartDate, na.rm = TRUE)
breaks_daily <- base::seq(range_dates[1], range_dates[2], by = "1 day")
p_overall_ls <-
ggplot2::ggplot(daily_overall_LS, ggplot2::aes(x = StartDate, y = mean_val)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo_val, ymax = hi_val), alpha = 0.7, fill = "#abc7ff") +
ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
ggplot2::geom_point(size = 0.9, colour = "#11357f") +
ggplot2::scale_x_date(breaks = breaks_daily,
labels = daily_overall |>
dplyr::mutate(
day = base::format(StartDate, "%d") |> gsub("^0", "", x = _),
mon = base::format(StartDate, "%b"),
label = if (day == "1") paste(day, mon) else day
) |>
dplyr::pull(label),
expand = base::c(0.01, 0.01)
) +
ggplot2::labs( x = NULL, y = "Life satisfaction") +
theme_gmh +
theme(axis.text.x = element_text(
margin = margin(t = 1), face = "bold", angle = 45, hjust = 1, vjust = 1, size = 7
),
panel.grid.major.x = ggplot2::element_line(
color = "#ddeded", linewidth = 0.25
)
)Error: object 'daily_overall_LS' not found
Show the code
df_time_phq <- df_gmh |>
dplyr::mutate(
StartDate = lubridate::as_date(StartDate)
) |>
dplyr::filter(!is.na(StartDate) & !is.na(gad_worry))
# Daily aggregates of mpwb_sum
daily_sum_phq <- df_time_phq |>
dplyr::group_by(StartDate) |>
dplyr::summarise(
n = dplyr::n(),
sum_x = base::sum(phq4_sum),
sum_x2 = base::sum(phq4_sum^2)
)
cum_sum_phq <- daily_sum_phq |>
dplyr::arrange(StartDate) |>
dplyr::mutate(
cum_n = cumsum(n),
cum_sum = cumsum(sum_x),
cum_sumsq = cumsum(sum_x2),
mean = cum_sum / cum_n,
var = dplyr::if_else(
cum_n > 1,
(cum_sumsq - (cum_sum^2) / cum_n) / (cum_n - 1),
NA_real_
),
se = sqrt(var / cum_n),
lo = mean - 1.96 * se,
hi = mean + 1.96 * se
)
p_daily_n_phq <-
ggplot2::ggplot(daily_sum_phq, ggplot2::aes(x = StartDate, y = n)) +
ggplot2::geom_col(width = 1, fill = "#11357f") +
ggplot2::labs(x = NULL, y = "Daily n\n") +
ggplot2::theme(
panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
)
p_cum_mean_phq <-
ggplot2::ggplot(cum_sum_phq, ggplot2::aes(x = StartDate, y = mean)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.7, fill = "#abc7ff") +
ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
ggplot2::geom_point(size = 0.9, colour = "#11357f") +
ggplot2::labs(x = "Date", y = "PHQ-4 (Rolling mean)\n") +
ggplot2::theme(
panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25)
)
# Daily aggregates per measure
long_dim_phq <- df_time_phq |>
dplyr::select(StartDate, dplyr::all_of(phq4_items)) |>
tidyr::pivot_longer(
cols = dplyr::all_of(phq4_items),
names_to = "measure",
values_to = "value"
)
daily_dim_phq <- long_dim_phq |>
dplyr::group_by(StartDate, measure) |>
dplyr::summarise(
n = dplyr::n(),
sum_x = sum(value),
sum_x2 = sum(value^2)
)
cum_dim_phq <- daily_dim_phq |>
dplyr::group_by(measure) |>
dplyr::arrange(StartDate, .by_group = TRUE) |>
dplyr::mutate(
cum_n = base::cumsum(n),
cum_sum = base::cumsum(sum_x),
cum_sumsq = base::cumsum(sum_x2),
mean = cum_sum / cum_n,
var = dplyr::if_else(
cum_n > 1,
(cum_sumsq - (cum_sum^2) / cum_n) / (cum_n - 1),
NA_real_
),
se = base::sqrt(var / cum_n),
lo = mean - 1.96 * se,
hi = mean + 1.96 * se
)
p_dims_phq <-
ggplot2::ggplot(cum_dim_phq, ggplot2::aes(x = StartDate, y = mean)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.7, fill = "#abc7ff") +
ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
ggplot2::geom_point(size = 0.8, colour = "#11357f") +
ggplot2::facet_wrap(~ measure_lab, ncol = 2) +
ggplot2::labs(x = "Date", y = "Rolling mean") +
theme(
panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25),
)
# Overall
daily_overall_phq <- df_time |>
tidyr::pivot_longer(
cols = phq4_sum,
names_to = "measure",
values_to = "value"
) |>
dplyr::group_by(StartDate, measure) |>
dplyr::summarise(
n = dplyr::n(),
mean_val = base::mean(value, na.rm = TRUE),
sd_val = stats::sd(value, na.rm = TRUE),
se_val = if (n > 1) sd_val / base::sqrt(n) else NA_real_,
lo_val = mean_val - 1.96 * se_val,
hi_val = mean_val + 1.96 * se_val
)
range_dates <- base::range(daily_overall_phq$StartDate, na.rm = TRUE)
breaks_daily <- base::seq(range_dates[1], range_dates[2], by = "1 day")
p_overall_phq <-
ggplot2::ggplot(daily_overall_phq, ggplot2::aes(x = StartDate, y = mean_val)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo_val, ymax = hi_val), alpha = 0.7, fill = "#abc7ff") +
ggplot2::geom_line(linewidth = 0.7, colour = "#11357f") +
ggplot2::geom_point(size = 0.9, colour = "#11357f") +
ggplot2::scale_x_date(breaks = breaks_daily,
labels = daily_overall |>
dplyr::mutate(
day = base::format(StartDate, "%d") |> gsub("^0", "", x = _),
mon = base::format(StartDate, "%b"),
label = if (day == "1") paste(day, mon) else day
) |>
dplyr::pull(label),
expand = base::c(0.01, 0.01)
) +
ggplot2::labs( x = NULL, y = "PHQ-4") +
theme_gmh +
theme(axis.text.x = element_text(
margin = margin(t = 1), face = "bold", angle = 45, hjust = 1, vjust = 1, size = 7
),
panel.grid.major.x = ggplot2::element_line(
color = "#ddeded", linewidth = 0.25
)
)cowplot::plot_grid(
p_daily_n_phq, p_cum_mean_phq,
ncol = 1, rel_heights = c(0.35, 0.65), align = "v"
)Error in `combine_vars()`:
! At least one layer must contain all faceting variables: `measure_lab`
✖ Plot is missing `measure_lab`
✖ Layer 1 is missing `measure_lab`
✖ Layer 2 is missing `measure_lab`
✖ Layer 3 is missing `measure_lab`
A2. MPWB Descriptives
Table 1
# Extract mpwb item labels from codebook
mpwb_measure <- codebook |>
dplyr::filter(variable %in% mpwb_items) |>
dplyr::pull(label, variable)
# Calculate estimates
mpwb_base1 <- df_gmh |>
dplyr::select(ps_weight, dplyr::all_of(mpwb_items)) |>
tidyr::pivot_longer(
cols = -ps_weight, names_to = "variable", values_to = "value"
) |>
# Calculate how many participants responded 6 or 7 (flourishing) to mpwb items
dplyr::mutate(is_flourish = value %in% c(6, 7)) |>
dplyr::group_by(variable) |>
dplyr::summarise(
mean_mpwb = Hmisc::wtd.mean(value, ps_weight, na.rm = TRUE),
sd_mpwb = sqrt(Hmisc::wtd.var(value, ps_weight, na.rm = TRUE)),
# is_flourish is boolean, so mean gives proportion
pct = 100 * Hmisc::wtd.mean(is_flourish, ps_weight, na.rm = TRUE),
n = dplyr::n()
) |>
dplyr::mutate(
dimension = mpwb_labels[variable],
description = mpwb_measure[variable],
mean_sd = sprintf("%.2f (%.2f)", mean_mpwb, sd_mpwb),
pct_flourishing = sprintf("%.1f", pct),
m = mean_mpwb,
) |>
dplyr::arrange(-m) |>
dplyr::select(dimension, description, mean_sd, pct_flourishing, n)
mpwb_table1 <- mpwb_base1 |>
dplyr::mutate(`% flourishing` = pct_flourishing) |>
dplyr::select(dimension, description, mean_sd, `% flourishing`) |>
gt::gt() |>
gt::cols_label(
dimension = "Dimension",
description = "Measure",
mean_sd = gt::md("*M*<sub>weighted</sub> (*SD*)"),
`% flourishing` = gt::md("% flourishing<sub>weighted</sub>")
) |>
gt::fmt_markdown(columns = c("mean_sd", "% flourishing")) |>
gt::tab_options(
table.border.top.color = "black",
table.border.top.style = "solid",
table.border.top.width = gt::px(1),
table.border.bottom.color = "black",
table.border.bottom.style = "solid",
table.border.bottom.width = gt::px(1),
heading.border.bottom.color = "black",
heading.border.bottom.style = "solid",
heading.border.bottom.width = gt::px(1),
table_body.hlines.color = "white",
row.striping.include_table_body = FALSE
); mpwb_table1| Dimension | Measure | Mweighted (SD) | |
|---|---|---|---|
| Positive relationships | I receive help and support from people I am close to when I need it. | 5.05 (1.46) | 38.5 |
| Meaning | I feel what I do in my life is valuable and worthwhile. | 4.92 (1.49) | 35.6 |
| Competence | I feel a sense of accomplishment from what I do. | 4.83 (1.44) | 31.8 |
| Engagement | I feel absorbed in what I am doing. | 4.81 (1.34) | 28.6 |
| Self-esteem | I feel positive about myself. | 4.80 (1.50) | 32.2 |
| Optimism | I am optimistic about my future. | 4.70 (1.59) | 31.3 |
| Positive emotion | I feel happy. | 4.69 (1.46) | 27.6 |
| Emotional stability | I feel calm and peaceful. | 4.46 (1.51) | 23.4 |
| Resilience | I recover quickly from things that go wrong in my life. | 4.44 (1.48) | 22.1 |
| Vitality | I feel full of energy. | 4.19 (1.56) | 19.2 |
file:////var/folders/57/fnv45qmj3_v67tzm1lxyx18r0000gn/T//RtmpDIh10E/file6ab961475642.html screenshot completed
Item Flourishing Estimates Per Country
df_gmh |>
dplyr::select(country, ps_weight, dplyr::all_of(mpwb_items)) |>
tidyr::pivot_longer(
cols = -c(country, ps_weight),
names_to = "variable",
values_to = "value"
) |>
dplyr::mutate(is_flourish = value %in% c(6, 7)) |>
dplyr::group_by(country, variable) |>
dplyr::summarise(
mean_mpwb = Hmisc::wtd.mean(value, ps_weight, na.rm = TRUE),
sd_mpwb = base::sqrt(Hmisc::wtd.var(value, ps_weight, na.rm = TRUE)),
pct = 100 * Hmisc::wtd.mean(is_flourish, ps_weight, na.rm = TRUE),
n = dplyr::n(),
.groups = "drop"
) |>
dplyr::mutate(
dimension = mpwb_labels[variable],
description = mpwb_measure[variable],
mean_sd = sprintf("%.2f (%.2f)", mean_mpwb, sd_mpwb),
pct_flourishing = sprintf("%.1f", pct),
m = mean_mpwb
) |>
dplyr::arrange(country, -m) |>
dplyr::select(country, dimension, description, mean_sd, pct_flourishing, n) |>
print_reactable(sorted_col = c("country"), width = 800)Flourishing Estimates considering PHQ-4 (rescaled)
# Using the simplified rule of scoring 60 or above on MPWB and two or below on PHQ-4, we find X% to be flourishing, ranging from X% in [lowest country] to X% in [highest country]. Future work on this data may provide more statistically robust approaches and interpretations, as well as incorporate other measures such as income and outlook.
# Flag flourishing (MPWB ≥ 60 and PHQ-4 rescaled ≤ 2)
df_flourish <- df_gmh |>
dplyr::filter(!is.na(gad_worry)) |>
dplyr::mutate(
flourishing = base::as.integer(mpwb_sum >= 60 & phq4_sum_rec <= 2)
)
table(df_flourish$phq4_sum_rec, df_flourish$flourishing)
0 1
0 947 870
1 1695 815
2 3362 901
3 6208 0
4 13246 0
5 1858 0
6 2246 0
7 1111 0
8 1552 0
9 770 0
10 1015 0
11 482 0
12 1431 0
0 1
10 59 0
11 22 0
12 30 0
13 35 0
14 56 0
15 48 0
16 72 0
17 78 0
18 95 0
19 92 0
20 108 0
21 128 0
22 160 0
23 191 0
24 191 0
25 220 0
26 259 0
27 236 0
28 323 0
29 348 0
30 400 0
31 445 0
32 494 0
33 536 0
34 636 0
35 668 0
36 737 0
37 725 0
38 884 0
39 900 0
40 1087 0
41 1065 0
42 1121 0
43 1148 0
44 1218 0
45 1318 0
46 1392 0
47 1381 0
48 1545 0
49 1508 0
50 1609 0
51 1448 0
52 1345 0
53 1290 0
54 1212 0
55 1172 0
56 1125 0
57 1011 0
58 901 0
59 813 0
60 426 375
61 342 307
62 263 286
63 213 283
64 208 247
65 127 198
66 123 180
67 83 167
68 82 148
69 48 112
70 123 283
# Aggregate weighted percentage flourishing
df_flourish |>
dplyr::summarise(
pct_flourishing =
100 * Hmisc::wtd.mean(flourishing, ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct_flourishing = base::round(pct_flourishing, 1)
)# A tibble: 1 × 1
pct_flourishing
<dbl>
1 6.5
# 4) Percentage flourishing by country
country_pct <- df_flourish |>
dplyr::group_by(country) |>
dplyr::summarise(
pct_flourishing =
100 * Hmisc::wtd.mean(flourishing, ps_weight, na.rm = TRUE),
) |>
dplyr::mutate(
pct_flourishing = base::round(pct_flourishing, 1)
) |>
dplyr::arrange(pct_flourishing)
# Country with the most and least flourishing
country_pct |>
dplyr::slice_max(pct_flourishing, n = 1, with_ties = FALSE)# A tibble: 1 × 2
country pct_flourishing
<chr> <dbl>
1 North Macedonia 24
# A tibble: 1 × 2
country pct_flourishing
<chr> <dbl>
1 Madagascar 0
# Sanity check
df_flourish |>
dplyr::filter(country == "Madagascar", mpwb_sum >= 60, phq4_sum_rec <= 2) |>
base::nrow()[1] 0
A3. Optional Section Completion Rates
# Overall completion rate of the optional section
df_gmh |>
dplyr::summarise(
n_total = dplyr::n(),
n_optional = base::sum(n_items_after == 10),
pct_optional = round(100 * n_optional / n_total, 1)
)# A tibble: 1 × 3
n_total n_optional pct_optional
<int> <int> <dbl>
1 53799 28229 52.5
# Overall completion rate of the optional section by country
opt_by_country <- df_gmh |>
dplyr::group_by(country) |>
dplyr::summarise(
n_total = dplyr::n(),
n_optional = base::sum(n_items_after == 10),
pct_optional = round(100 * n_optional / n_total, 1)
)
opt_by_country |> dplyr::slice_min(pct_optional, n = 1, with_ties = FALSE)# A tibble: 1 × 4
country n_total n_optional pct_optional
<chr> <int> <int> <dbl>
1 Japan 431 18 4.2
# A tibble: 1 × 4
country n_total n_optional pct_optional
<chr> <int> <int> <dbl>
1 Ukraine 654 530 81
# Completion time of participants that completed the optional section
df_gmh |>
dplyr::filter(n_items_after == 10) |>
dplyr::summarise(med = stats::median(duration_sec, na.rm = TRUE) / 60) |>
dplyr::pull(med)[1] 6.316667
# Completion time of participants that completed the mandatory section only
df_gmh |>
dplyr::filter(n_items_after == 0) |>
dplyr::summarise(med = stats::median(duration_sec, na.rm = TRUE) / 60) |>
dplyr::pull(med)[1] 6.8
A4. Aggregate Descriptive Information
# First, recode education and employment variables
lvl_emp <- levels(df_gmh$employment_primary)
lvl_edu <- levels(df_gmh$education_recoded_cat)
df_agg <- df_gmh |>
dplyr::mutate(
dur_min = duration_sec / 60,
# Recode NA as Removed for sex and income variables
sex_reviewed_cat = base::factor(
dplyr::case_when(
is.na(sex_reviewed_cat) ~ "Removed",
TRUE ~ sex_reviewed_cat
),
levels = c("Male","Female","Other","Removed"),
ordered = FALSE
),
income_merg_cat = base::factor(
dplyr::case_when(
base::is.na(income_merg_cat) ~ "Removed",
TRUE ~ income_merg_cat
),
levels = c(
"No income",
"Second decile",
"Third decile",
"Fourth decile",
"Fifth decile",
"Sixth decile",
"Seventh decile",
"Eighth decile",
"Ninth decile",
"Tenth decile",
"Removed"
),
ordered = TRUE
),
# Consider self-employed as part-time employed
# (31 from the sponsored dataset provided by the team representing Ireland)
employment_primary = as.character(employment_primary),
employment_primary = dplyr::if_else(
employment_irl == "Self-employed",
"Employed/working part-time (less than 25 hours per week)",
employment_primary,
missing = employment_primary
),
employment_primary = base::factor(employment_primary, levels = lvl_emp),
# Consider Inclusive education as Technical
# (30 from the Peru)
education_recoded_cat = as.character(education_recoded_cat),
education_recoded_cat = dplyr::if_else(
education_cat == "Inclusive education",
"Technical",
education_recoded_cat,
missing = education_recoded_cat
),
education_recoded_cat = factor(education_recoded_cat, levels = lvl_edu)
)
# Function for categorical variables
tbl_block <- function(df, var_name, label, drop_na = FALSE, show_header = FALSE) {
x <- df[[var_name]]
keep <- if (drop_na) !is.na(x) else rep(TRUE, length(x))
df_keep <- df[keep, , drop = FALSE]
# unweighted counts and percentages
tab_u <- df_keep |>
dplyr::count(.data[[var_name]], name = "n_u") |>
dplyr::mutate(
level = as.character(.data[[var_name]]),
pct_u = 100 * n_u / sum(n_u)
)
# weighted counts and percentages
tab_w <- df_keep |>
dplyr::group_by(.data[[var_name]]) |>
dplyr::summarise(n_w = sum(ps_weight, na.rm = TRUE)) |>
dplyr::mutate(
level = as.character(.data[[var_name]]),
pct_w = 100 * n_w / sum(n_w)
)
# join weighted/unweighted
tab <- dplyr::full_join(tab_u, tab_w, by = "level")
# format display
tab <- tab |>
dplyr::mutate(
unweighted = paste0(
ifelse(is.na(n_u), 0, n_u),
" (", sprintf("%.1f", ifelse(is.na(pct_u), 0, pct_u)), "%)"
),
weighted = paste0(
round(ifelse(is.na(n_w), 0, n_w), 0),
" (", sprintf("%.1f", ifelse(is.na(pct_w), 0, pct_w)), "%)"
)
)
header <- if (show_header) "n (%)" else ""
dplyr::bind_rows(
tibble::tibble(Variable = label, Unweighted = header, Weighted = header),
tab |>
dplyr::transmute(
Variable = paste0("\u00A0\u00A0\u00A0", level),
Unweighted = unweighted,
Weighted = weighted
)
)
}
# duration
dur_q_u <- stats::quantile(df_agg$dur_min, probs = c(0.25, 0.5, 0.75), na.rm = TRUE, names = FALSE)
dur_Md_u <- as.numeric(dur_q_u[2])
dur_IQR_u <- as.numeric(dur_q_u[3] - dur_q_u[1])table2 <- dplyr::bind_rows(
tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
tibble::tibble(Variable = "", Unweighted = "M (SD)", Weighted = "M (SD)"),
tibble::tibble(
Variable = "MPWB sum (range 10–70)",
Unweighted =
sprintf("%.2f (%.2f)",
base::mean(df_agg$mpwb_sum, na.rm = TRUE),
stats::sd(df_agg$mpwb_sum, na.rm = TRUE)),
Weighted =
sprintf("%.2f (%.2f)",
Hmisc::wtd.mean(
df_agg$mpwb_sum, df_gmh$ps_weight, na.rm = TRUE),
sqrt(Hmisc::wtd.var(
df_agg$mpwb_sum, df_gmh$ps_weight, na.rm = TRUE)))
),
tibble::tibble(
Variable = "Life satisfaction (range 1–7)",
Unweighted =
sprintf("%.2f (%.2f)",
base::mean(df_agg$life_satisfaction, na.rm = TRUE),
stats::sd(df_agg$life_satisfaction, na.rm = TRUE)),
Weighted =
sprintf("%.2f (%.2f)",
Hmisc::wtd.mean(
df_agg$life_satisfaction, df_gmh$ps_weight, na.rm = TRUE),
sqrt(Hmisc::wtd.var(
df_agg$life_satisfaction, df_gmh$ps_weight, na.rm = TRUE)))
),
tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
tibble::tibble(Variable = "", Unweighted = "Md (IQR)", Weighted = "Md (IQR)"),
tibble::tibble(
Variable = "Duration (minutes)",
Unweighted = sprintf("%.2f (%.2f)", dur_Md_u, dur_IQR_u),
Weighted = "\u2014"
),
tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
tbl_block(df_agg, "income_merg_cat", "Household income", show_header = TRUE),
tbl_block(df_agg, "household_size_group", "Household size"),
tbl_block(df_agg, "age_group", "Age group"),
tbl_block(df_agg, "sex_reviewed_cat", "Sex"),
tbl_block(df_agg, "education_recoded_cat", "Education level"),
tbl_block(df_agg, "employment_primary", "Employment status"),
tbl_block(df_agg, "citizenship_cat", "Citizenship"),
tbl_block(df_agg, "childhood_SES_cat", "Childhood socioeconomic status", drop_na = TRUE),
tbl_block(df_agg, "work_arrangement_cat", "Work arrangement", drop_na = TRUE)
)
gt_table2 <- gt::gt(table2) |>
gt::cols_label(
Variable = "Variable",
Unweighted = "Unweighted",
Weighted = "Weighted"
) |>
gt::tab_options(
data_row.padding = gt::px(4),
table.border.top.color = "black",
table.border.top.width = gt::px(0.5),
table.border.bottom.color = "black",
table.border.bottom.width = gt::px(0.5),
heading.border.bottom.color = "black",
heading.border.bottom.width = gt::px(0.5),
table_body.hlines.color = "white",
row.striping.include_table_body = FALSE
) |>
gt::tab_style(
style = gt::cell_text(style = "italic"),
locations = gt::cells_body(
rows =
Unweighted %in% c("M (SD)", "Md (IQR)", "n (%)") |
Weighted %in% c("M (SD)", "Md (IQR)", "n (%)"),
columns = c("Unweighted", "Weighted")
)
); gt_table2| Variable | Unweighted | Weighted |
|---|---|---|
| M (SD) | M (SD) | |
| MPWB sum (range 10–70) | 47.43 (10.94) | 46.87 (11.09) |
| Life satisfaction (range 1–7) | 6.42 (2.38) | 6.30 (2.46) |
| Md (IQR) | Md (IQR) | |
| Duration (minutes) | 6.17 (4.52) | — |
| Household income | n (%) | n (%) |
| No income | 1948 (3.6%) | 1509 (4.1%) |
| Second decile | 6194 (11.5%) | 4848 (13.0%) |
| Third decile | 6583 (12.2%) | 4839 (13.0%) |
| Fourth decile | 6675 (12.4%) | 4518 (12.2%) |
| Fifth decile | 6360 (11.8%) | 4342 (11.7%) |
| Sixth decile | 5637 (10.5%) | 3870 (10.4%) |
| Seventh decile | 5001 (9.3%) | 3239 (8.7%) |
| Eighth decile | 4880 (9.1%) | 3195 (8.6%) |
| Ninth decile | 3793 (7.1%) | 2385 (6.4%) |
| Tenth decile | 6011 (11.2%) | 3880 (10.4%) |
| Removed | 717 (1.3%) | 551 (1.5%) |
| Household size | ||
| 1 | 13846 (25.7%) | 10113 (27.2%) |
| 2 | 13726 (25.5%) | 9900 (26.6%) |
| 3 | 9031 (16.8%) | 5974 (16.1%) |
| 4-5 | 13115 (24.4%) | 8494 (22.8%) |
| 6-20 | 4081 (7.6%) | 2693 (7.2%) |
| Age group | ||
| 18-25 | 11550 (21.5%) | 8195 (22.0%) |
| 26-44 | 28059 (52.2%) | 15684 (42.2%) |
| 45-64 | 11970 (22.2%) | 10118 (27.2%) |
| 65-74 | 1776 (3.3%) | 2555 (6.9%) |
| 75+ | 444 (0.8%) | 623 (1.7%) |
| Sex | ||
| Male | 20738 (38.5%) | 16605 (44.7%) |
| Female | 32607 (60.6%) | 20116 (54.1%) |
| Other | 352 (0.7%) | 352 (0.9%) |
| Removed | 102 (0.2%) | 102 (0.3%) |
| Education level | ||
| Less than secondary | 1346 (2.5%) | 2190 (5.9%) |
| Secondary | 10226 (19.0%) | 11556 (31.1%) |
| Technical | 5693 (10.6%) | 4602 (12.4%) |
| University | 19276 (35.8%) | 10454 (28.1%) |
| Advanced | 17258 (32.1%) | 8373 (22.5%) |
| Employment status | ||
| Not in paid employment (by choice/health) | 3263 (6.1%) | 2808 (7.6%) |
| Not in paid employment (looking for work) | 3883 (7.2%) | 2815 (7.6%) |
| Student non-working (Full or part-time) | 7122 (13.2%) | 4814 (12.9%) |
| Employed/working full-time (25+ hours per week) | 30478 (56.7%) | 18986 (51.1%) |
| Employed/working part-time (less than 25 hours per week) | 5961 (11.1%) | 4394 (11.8%) |
| Retired | 2288 (4.3%) | 2882 (7.8%) |
| Military service | 804 (1.5%) | 475 (1.3%) |
| Citizenship | ||
| Citizen | 49136 (91.3%) | 33763 (90.8%) |
| Non-citizen (Permanent Resident) | 1400 (2.6%) | 1064 (2.9%) |
| Born outside country (Citizen) | 1058 (2.0%) | 779 (2.1%) |
| Born outside country (Non-citizen, Permanent Resident) | 574 (1.1%) | 345 (0.9%) |
| Born outside country (Non-citizen, Non-permanent Resident) | 1631 (3.0%) | 1224 (3.3%) |
| Childhood socioeconomic status | ||
| Poor | 4379 (11.4%) | 3405 (12.8%) |
| Below average but not poor | 10022 (26.0%) | 7203 (27.1%) |
| Around average | 14523 (37.7%) | 9766 (36.7%) |
| Above average but not wealthy | 8439 (21.9%) | 5498 (20.7%) |
| Wealthy | 1115 (2.9%) | 715 (2.7%) |
| Work arrangement | ||
| I work entirely in-person (i.e., in an office, on-site) | 17150 (53.2%) | 11582 (55.3%) |
| I mostly work in-person, with occasional remote days | 5857 (18.2%) | 3572 (17.1%) |
| I work about evenly in-person/remote | 3375 (10.5%) | 2061 (9.8%) |
| I mostly work remotely, with occasional in-person days | 3077 (9.5%) | 1894 (9.0%) |
| I work entirely remotely | 2776 (8.6%) | 1827 (8.7%) |
file:////var/folders/57/fnv45qmj3_v67tzm1lxyx18r0000gn/T//RtmpDIh10E/file6ab976b74de5.html screenshot completed
full_tbl_agg <- dplyr::bind_rows(
tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
tibble::tibble(Variable = "", Unweighted = "M (SD)", Weighted = "M (SD)"),
tibble::tibble(
Variable = "MPWB sum (range 10–70)",
Unweighted =
sprintf("%.2f (%.2f)",
base::mean(df_agg$mpwb_sum, na.rm = TRUE),
stats::sd(df_agg$mpwb_sum, na.rm = TRUE)),
Weighted =
sprintf("%.2f (%.2f)",
Hmisc::wtd.mean(
df_agg$mpwb_sum, df_gmh$ps_weight, na.rm = TRUE),
sqrt(Hmisc::wtd.var(
df_agg$mpwb_sum, df_gmh$ps_weight, na.rm = TRUE)))
),
tibble::tibble(
Variable = "Life satisfaction (range 1–7)",
Unweighted =
sprintf("%.2f (%.2f)",
base::mean(df_agg$life_satisfaction, na.rm = TRUE),
stats::sd(df_agg$life_satisfaction, na.rm = TRUE)),
Weighted =
sprintf("%.2f (%.2f)",
Hmisc::wtd.mean(
df_agg$life_satisfaction, df_gmh$ps_weight, na.rm = TRUE),
sqrt(Hmisc::wtd.var(
df_agg$life_satisfaction, df_gmh$ps_weight, na.rm = TRUE)))
),
tibble::tibble(
Variable = "Age (range 18–100)",
Unweighted =
sprintf("%.2f (%.2f)",
base::mean(df_agg$age, na.rm = TRUE),
stats::sd(df_agg$age, na.rm = TRUE)),
Weighted =
sprintf("%.2f (%.2f)",
Hmisc::wtd.mean(
df_agg$age, df_gmh$ps_weight, na.rm = TRUE),
sqrt(Hmisc::wtd.var(
df_agg$age, df_gmh$ps_weight, na.rm = TRUE)))
),
tibble::tibble(
Variable = "PHQ-4 sum (range 4-28)",
Unweighted =
sprintf("%.2f (%.2f)",
base::mean(df_agg$phq4_sum, na.rm = TRUE),
stats::sd(df_agg$phq4_sum, na.rm = TRUE)),
Weighted =
sprintf("%.2f (%.2f)",
Hmisc::wtd.mean(
df_agg$phq4_sum, df_gmh$ps_weight, na.rm = TRUE),
sqrt(Hmisc::wtd.var(
df_agg$phq4_sum, df_gmh$ps_weight, na.rm = TRUE)))
),
tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
tibble::tibble(Variable = "", Unweighted = "Md (IQR)", Weighted = "Md (IQR)"),
tibble::tibble(
Variable = "Duration (minutes)",
Unweighted = sprintf("%.2f (%.2f)", dur_Md_u, dur_IQR_u),
Weighted = "\u2014"
),
tibble::tibble(Variable = "", Unweighted = "", Weighted = ""),
tbl_block(df_agg, "income_merg_cat", "Household income", show_header = TRUE),
tbl_block(df_agg, "household_size_group", "Household size"),
tbl_block(df_agg, "age_group", "Age group"),
tbl_block(df_agg, "sex_reviewed_cat", "Sex"),
tbl_block(df_agg, "education_recoded_cat", "Education level"),
tbl_block(df_agg, "employment_primary", "Employment status"),
tbl_block(df_agg, "citizenship_cat", "Citizenship"),
tbl_block(df_agg, "childhood_SES_cat", "Childhood socioeconomic status", drop_na = TRUE),
tbl_block(df_agg, "work_arrangement_cat", "Work arrangement", drop_na = TRUE),
tbl_block(df_agg, "fin_outlook_cat", "Financial outlook", drop_na = TRUE),
tbl_block(df_agg, "attention_care_cat", "Attention and Care", drop_na = TRUE)
)
gt_table2 <- gt::gt(table2) |>
gt::cols_label(
Variable = "Variable",
Unweighted = "Unweighted",
Weighted = "Weighted"
) |>
gt::tab_options(
data_row.padding = gt::px(4),
table.border.top.color = "black",
table.border.top.width = gt::px(0.5),
table.border.bottom.color = "black",
table.border.bottom.width = gt::px(0.5),
heading.border.bottom.color = "black",
heading.border.bottom.width = gt::px(0.5),
table_body.hlines.color = "white",
row.striping.include_table_body = FALSE
) |>
gt::tab_style(
style = gt::cell_text(style = "italic"),
locations = gt::cells_body(
rows =
Unweighted %in% c("M (SD)", "Md (IQR)", "n (%)") |
Weighted %in% c("M (SD)", "Md (IQR)", "n (%)"),
columns = c("Unweighted", "Weighted")
)
); gt_table2| Variable | Unweighted | Weighted |
|---|---|---|
| M (SD) | M (SD) | |
| MPWB sum (range 10–70) | 47.43 (10.94) | 46.87 (11.09) |
| Life satisfaction (range 1–7) | 6.42 (2.38) | 6.30 (2.46) |
| Md (IQR) | Md (IQR) | |
| Duration (minutes) | 6.17 (4.52) | — |
| Household income | n (%) | n (%) |
| No income | 1948 (3.6%) | 1509 (4.1%) |
| Second decile | 6194 (11.5%) | 4848 (13.0%) |
| Third decile | 6583 (12.2%) | 4839 (13.0%) |
| Fourth decile | 6675 (12.4%) | 4518 (12.2%) |
| Fifth decile | 6360 (11.8%) | 4342 (11.7%) |
| Sixth decile | 5637 (10.5%) | 3870 (10.4%) |
| Seventh decile | 5001 (9.3%) | 3239 (8.7%) |
| Eighth decile | 4880 (9.1%) | 3195 (8.6%) |
| Ninth decile | 3793 (7.1%) | 2385 (6.4%) |
| Tenth decile | 6011 (11.2%) | 3880 (10.4%) |
| Removed | 717 (1.3%) | 551 (1.5%) |
| Household size | ||
| 1 | 13846 (25.7%) | 10113 (27.2%) |
| 2 | 13726 (25.5%) | 9900 (26.6%) |
| 3 | 9031 (16.8%) | 5974 (16.1%) |
| 4-5 | 13115 (24.4%) | 8494 (22.8%) |
| 6-20 | 4081 (7.6%) | 2693 (7.2%) |
| Age group | ||
| 18-25 | 11550 (21.5%) | 8195 (22.0%) |
| 26-44 | 28059 (52.2%) | 15684 (42.2%) |
| 45-64 | 11970 (22.2%) | 10118 (27.2%) |
| 65-74 | 1776 (3.3%) | 2555 (6.9%) |
| 75+ | 444 (0.8%) | 623 (1.7%) |
| Sex | ||
| Male | 20738 (38.5%) | 16605 (44.7%) |
| Female | 32607 (60.6%) | 20116 (54.1%) |
| Other | 352 (0.7%) | 352 (0.9%) |
| Removed | 102 (0.2%) | 102 (0.3%) |
| Education level | ||
| Less than secondary | 1346 (2.5%) | 2190 (5.9%) |
| Secondary | 10226 (19.0%) | 11556 (31.1%) |
| Technical | 5693 (10.6%) | 4602 (12.4%) |
| University | 19276 (35.8%) | 10454 (28.1%) |
| Advanced | 17258 (32.1%) | 8373 (22.5%) |
| Employment status | ||
| Not in paid employment (by choice/health) | 3263 (6.1%) | 2808 (7.6%) |
| Not in paid employment (looking for work) | 3883 (7.2%) | 2815 (7.6%) |
| Student non-working (Full or part-time) | 7122 (13.2%) | 4814 (12.9%) |
| Employed/working full-time (25+ hours per week) | 30478 (56.7%) | 18986 (51.1%) |
| Employed/working part-time (less than 25 hours per week) | 5961 (11.1%) | 4394 (11.8%) |
| Retired | 2288 (4.3%) | 2882 (7.8%) |
| Military service | 804 (1.5%) | 475 (1.3%) |
| Citizenship | ||
| Citizen | 49136 (91.3%) | 33763 (90.8%) |
| Non-citizen (Permanent Resident) | 1400 (2.6%) | 1064 (2.9%) |
| Born outside country (Citizen) | 1058 (2.0%) | 779 (2.1%) |
| Born outside country (Non-citizen, Permanent Resident) | 574 (1.1%) | 345 (0.9%) |
| Born outside country (Non-citizen, Non-permanent Resident) | 1631 (3.0%) | 1224 (3.3%) |
| Childhood socioeconomic status | ||
| Poor | 4379 (11.4%) | 3405 (12.8%) |
| Below average but not poor | 10022 (26.0%) | 7203 (27.1%) |
| Around average | 14523 (37.7%) | 9766 (36.7%) |
| Above average but not wealthy | 8439 (21.9%) | 5498 (20.7%) |
| Wealthy | 1115 (2.9%) | 715 (2.7%) |
| Work arrangement | ||
| I work entirely in-person (i.e., in an office, on-site) | 17150 (53.2%) | 11582 (55.3%) |
| I mostly work in-person, with occasional remote days | 5857 (18.2%) | 3572 (17.1%) |
| I work about evenly in-person/remote | 3375 (10.5%) | 2061 (9.8%) |
| I mostly work remotely, with occasional in-person days | 3077 (9.5%) | 1894 (9.0%) |
| I work entirely remotely | 2776 (8.6%) | 1827 (8.7%) |
# Kish Effective Sample Size
df_gmh |>
dplyr::summarise(
n_obs = dplyr::n(),
sum_w = base::sum(ps_weight, na.rm = TRUE),
sum_w2 = base::sum(ps_weight^2, na.rm = TRUE),
mean_w = base::mean(ps_weight, na.rm = TRUE),
sd_w = stats::sd(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
kish_ess = (sum_w^2) / sum_w2,
cv_w = sd_w / mean_w,
deff = 1 + cv_w^2,
kish_from_n_deff = n_obs / deff
)# A tibble: 1 × 9
n_obs sum_w sum_w2 mean_w sd_w kish_ess cv_w deff kish_from_n_deff
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 53799 37175. 47006. 0.691 0.630 29400. 0.911 1.83 29399.
A5. Per-country Sample Characteristics
Show the code
# N total per country
df_n_total <- df_gmh |>
dplyr::count(country, name = "n_total")
# age_group
df_age_group <- df_gmh |>
dplyr::count(country, age_group, name = "n") |>
tidyr::pivot_wider(
names_from = age_group,
values_from = n,
values_fill = 0
) |>
dplyr::select(country, `18-25`, `26-44`, `45-64`, `65-74`, `75+`)
# childhood_SES_cat
df_childhood_ses <- df_gmh |>
dplyr::filter(!is.na(childhood_SES_cat)) |>
dplyr::count(country, childhood_SES_cat, name = "n") |>
tidyr::pivot_wider(
names_from = childhood_SES_cat,
values_from = n,
values_fill = 0
)
# citizenship_cat
df_citizenship <- df_gmh |>
dplyr::count(country, citizenship_cat, name = "n") |>
tidyr::pivot_wider(
names_from = citizenship_cat,
values_from = n,
values_fill = 0
)
# education_recoded_cat
df_education <- df_gmh |>
dplyr::filter(!is.na(education_recoded_cat)) |>
dplyr::count(country, education_recoded_cat, name = "n") |>
tidyr::pivot_wider(
names_from = education_recoded_cat,
values_from = n,
values_fill = 0
) |>
dplyr::select(country, `Less than secondary`,
Secondary, Technical, University, Advanced)
# employment_primary
df_employment <- df_gmh |>
dplyr::filter(!is.na(employment_primary)) |>
dplyr::count(country, employment_primary, name = "n") |>
tidyr::pivot_wider(
names_from = employment_primary,
values_from = n,
values_fill = 0
)
# sex_reviewed_cat
df_sex <- df_gmh |>
dplyr::filter(!is.na(sex_reviewed_cat)) |>
dplyr::count(country, sex_reviewed_cat, name = "n") |>
tidyr::pivot_wider(
names_from = sex_reviewed_cat,
values_from = n,
values_fill = 0
)
# household size
df_household_size <- df_gmh |>
dplyr::count(country, household_size_group, name = "n") |>
tidyr::pivot_wider(
names_from = household_size_group,
values_from = n,
values_fill = 0
)
# financial outlook
df_fin_outlook <- df_gmh |>
dplyr::filter(!is.na(fin_outlook_cat)) |>
dplyr::count(country, fin_outlook_cat, name = "n") |>
tidyr::pivot_wider(
names_from = fin_outlook_cat,
values_from = n,
values_fill = 0
)
# work_arrangement_cat
df_work_arrangement <- df_gmh |>
dplyr::filter(!is.na(work_arrangement_cat)) |>
dplyr::count(country, work_arrangement_cat, name = "n") |>
tidyr::pivot_wider(
names_from = work_arrangement_cat,
values_from = n,
values_fill = 0
)
# sponsorship
df_sponsorship <- df_gmh |>
dplyr::mutate(
sponsorship = case_when(
sponsored == 1 ~ "Sponsored",
sponsored == 0 ~ "Non-sponsored",
TRUE ~ NA_character_
)
) |>
dplyr::count(country, sponsorship, name = "n") |>
tidyr::pivot_wider(
names_from = sponsorship,
values_from = n,
values_fill = 0
)
# income_merg_cat
# remove rows with "Student non-working (Full or part-time)" in employment_primary
df_income <- df_gmh |>
dplyr::filter(
employment_primary != "Student non-working (Full or part-time)" &
!is.na(income_merg_cat)
) |>
dplyr::count(country, income_merg_cat, name = "n") |>
tidyr::pivot_wider(
names_from = income_merg_cat,
values_from = n,
values_fill = 0
) |>
dplyr::select(country, `No income`,
`Second decile`, `Third decile`,
`Fourth decile`, `Fifth decile`, `Sixth decile`,
`Seventh decile`, `Eighth decile`,
`Ninth decile`, `Tenth decile`)
# join everything by country
df_demo_counts <- df_n_total |>
dplyr::full_join(df_age_group, by = "country") |>
dplyr::full_join(df_childhood_ses, by = "country") |>
dplyr::full_join(df_citizenship, by = "country") |>
dplyr::full_join(df_education, by = "country") |>
dplyr::full_join(df_employment, by = "country") |>
dplyr::full_join(df_sex, by = "country") |>
dplyr::full_join(df_household_size, by = "country") |>
dplyr::full_join(df_fin_outlook, by = "country") |>
dplyr::full_join(df_work_arrangement, by = "country") |>
dplyr::full_join(df_income, by = "country") |>
dplyr::full_join(df_sponsorship, by = "country")
write_csv(df_demo_counts, "222_countries_demographics_raw.csv")
reactable::reactable(
df_demo_counts,
pagination = FALSE,
height = 650,
width = 800,
defaultSorted = "country",
defaultSortOrder = "asc",
searchable = TRUE,
striped = TRUE,
compact = TRUE,
highlight = TRUE,
columnGroups = list(
colGroup(
name = "Age group",
columns = colnames(df_demo_counts)[3:7]),
colGroup(
name = "Childhood SES",
columns = colnames(df_demo_counts)[8:12]),
colGroup(
name = "Citizenship status",
columns = colnames(df_demo_counts)[13:17]),
colGroup(
name = "Education level",
columns = colnames(df_demo_counts)[18:22]),
colGroup(
name = "Employmnet status",
columns = colnames(df_demo_counts)[23:29]),
colGroup(
name = "Sex",
columns = colnames(df_demo_counts)[30:32]),
colGroup(
name = "Household size",
columns = colnames(df_demo_counts)[33:37]),
colGroup(
name = "Financial outlook",
columns = colnames(df_demo_counts)[38:42]),
colGroup(
name = "Work arrangement",
columns = colnames(df_demo_counts)[43:47]),
colGroup(
name = "Income deciles (student non-working excluded)",
columns = colnames(df_demo_counts)[48:57]),
colGroup(
name = "Sponsorship status",
columns = colnames(df_demo_counts)[58:59])
),
defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
defaultColDef = reactable::colDef(
filterable = FALSE,
vAlign = "center",
headerVAlign = "bottom",
class = "cell",
headerClass = "header",
headerStyle = list(fontSize = "13px"),
style = list(fontSize = "13px")),
columns = list(
country = reactable::colDef(
name = "Country",
sticky = "left",
width = 100
),
n_total = reactable::colDef(
name = "<em>N</em><sub>total</sub>",
html = TRUE,
width = 50
)
)
)Show the code
# N total per country
df_n_total <- df_gmh |>
dplyr::count(country, name = "n_total")
# age_group
df_age_group <- df_gmh |>
dplyr::count(country, age_group, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = age_group,
values_from = pct,
values_fill = 0
) |>
dplyr::select(country, `18-25`, `26-44`, `45-64`, `65-74`, `75+`)
# childhood_SES_cat
df_childhood_ses <- df_gmh |>
dplyr::count(country, childhood_SES_cat, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = childhood_SES_cat,
values_from = pct,
values_fill = 0
)
# citizenship_cat
df_citizenship <- df_gmh |>
dplyr::count(country, citizenship_cat, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = citizenship_cat,
values_from = pct,
values_fill = 0
)
# education_recoded_cat
df_education <- df_gmh |>
dplyr::filter(!is.na(education_recoded_cat)) |>
dplyr::count(country, education_recoded_cat, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = education_recoded_cat,
values_from = pct,
values_fill = 0
) |>
dplyr::select(
country,
`Less than secondary`,
Secondary,
Technical,
University,
Advanced
)
# employment_primary
df_employment <- df_gmh |>
dplyr::filter(!is.na(employment_primary)) |>
dplyr::count(country, employment_primary, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = employment_primary,
values_from = pct,
values_fill = 0
)
# sex_reviewed_cat
df_sex <- df_gmh |>
dplyr::filter(!is.na(sex_reviewed_cat)) |>
dplyr::count(country, sex_reviewed_cat, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = sex_reviewed_cat,
values_from = pct,
values_fill = 0
)
# household_size_group
df_household_size <- df_gmh |>
dplyr::count(country, household_size_group, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = household_size_group,
values_from = pct,
values_fill = 0
)
# fin_outlook_cat
df_fin_outlook <- df_gmh |>
dplyr::filter(!is.na(fin_outlook_cat)) |>
dplyr::count(country, fin_outlook_cat, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = fin_outlook_cat,
values_from = pct,
values_fill = 0
)
# work_arrangement_cat
df_work_arrangement <- df_gmh |>
dplyr::filter(!is.na(work_arrangement_cat)) |>
dplyr::count(country, work_arrangement_cat, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = work_arrangement_cat,
values_from = pct,
values_fill = 0
)
# sponsorship
df_sponsorship <- df_gmh |>
dplyr::mutate(
sponsorship = dplyr::case_when(
sponsored == 1 ~ "Sponsored",
sponsored == 0 ~ "Non-sponsored",
TRUE ~ NA_character_
)
) |>
dplyr::filter(!is.na(sponsorship)) |>
dplyr::count(country, sponsorship, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = sponsorship,
values_from = pct,
values_fill = 0
)
# income_merg_cat (student non-working excluded)
df_income <- df_gmh |>
dplyr::filter(
employment_primary != "Student non-working (Full or part-time)" &
!is.na(income_merg_cat)
) |>
dplyr::count(country, income_merg_cat, name = "n") |>
dplyr::group_by(country) |>
dplyr::mutate(pct = base::round(100 * n / sum(n), 1)) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = income_merg_cat,
values_from = pct,
values_fill = 0
) |>
dplyr::select(
country,
`No income`,
`Second decile`,
`Third decile`,
`Fourth decile`,
`Fifth decile`,
`Sixth decile`,
`Seventh decile`,
`Eighth decile`,
`Ninth decile`,
`Tenth decile`
)
# join everything by country
df_demo_pct <- df_n_total |>
dplyr::full_join(df_age_group, by = "country") |>
dplyr::full_join(df_childhood_ses, by = "country") |>
dplyr::full_join(df_citizenship, by = "country") |>
dplyr::full_join(df_education, by = "country") |>
dplyr::full_join(df_employment, by = "country") |>
dplyr::full_join(df_sex, by = "country") |>
dplyr::full_join(df_household_size, by = "country") |>
dplyr::full_join(df_fin_outlook, by = "country") |>
dplyr::full_join(df_work_arrangement, by = "country") |>
dplyr::full_join(df_income, by = "country") |>
dplyr::full_join(df_sponsorship, by = "country")
reactable::reactable(
df_demo_pct,
pagination = FALSE,
height = 650,
width = 800,
defaultSorted = "country",
defaultSortOrder = "asc",
searchable = TRUE,
striped = TRUE,
compact = TRUE,
highlight = TRUE,
columnGroups = list(
colGroup(
name = "Age group",
columns = colnames(df_demo_pct)[3:7]
),
colGroup(
name = "Childhood SES",
columns = colnames(df_demo_pct)[8:12]
),
colGroup(
name = "Citizenship status",
columns = colnames(df_demo_pct)[13:17]
),
colGroup(
name = "Education level",
columns = colnames(df_demo_pct)[18:22]
),
colGroup(
name = "Employmnet status",
columns = colnames(df_demo_pct)[23:29]
),
colGroup(
name = "Sex",
columns = colnames(df_demo_pct)[30:32]
),
colGroup(
name = "Household size",
columns = colnames(df_demo_pct)[33:37]
),
colGroup(
name = "Financial outlook",
columns = colnames(df_demo_pct)[38:42]
),
colGroup(
name = "Work arrangement",
columns = colnames(df_demo_pct)[43:47]
),
colGroup(
name = "Income deciles (student non-working excluded)",
columns = colnames(df_demo_pct)[48:57]
),
colGroup(
name = "Sponsorship status",
columns = colnames(df_demo_pct)[58:59]
)
),
defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
defaultColDef = reactable::colDef(
filterable = FALSE,
vAlign = "center",
headerVAlign = "bottom",
class = "cell",
headerClass = "header",
headerStyle = list(fontSize = "13px"),
style = list(fontSize = "13px")),
columns = list(
country = reactable::colDef(
name = "Country",
sticky = "left",
width = 100
),
n_total = reactable::colDef(
name = "<em>N</em><sub>total</sub>",
html = TRUE,
width = 50
)
)
)Show the code
# A tibble: 1 × 2
country Female
<chr> <dbl>
1 Estonia 90.3
Show the code
# A tibble: 1 × 2
country Female
<chr> <dbl>
1 Chad 22.6
Show the code
Warning in rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses, : object 'df_demo_counts' not found
Show the code
country_avg <- df_gmh |>
dplyr::group_by(country) |>
dplyr::summarise(
mean_age = base::round(base::mean(age, na.rm = TRUE), 1),
sd_age = base::round(stats::sd(age, na.rm = TRUE), 1),
mean_mpwb_sum = base::round(base::mean(mpwb_sum, na.rm = TRUE), 1),
sd_mpwb_sum = base::round(stats::sd(mpwb_sum, na.rm = TRUE), 1),
mean_phq4_sum = base::round(base::mean(phq4_sum, na.rm = TRUE), 1),
sd_phq4_sum = base::round(stats::sd(phq4_sum, na.rm = TRUE), 1),
mean_ls = base::round(base::mean(life_satisfaction, na.rm = TRUE), 1),
sd_ls = base::round(stats::sd(life_satisfaction, na.rm = TRUE), 1),
mean_assets_USD = base::round(base::mean(assets_USD, na.rm = TRUE), 1),
sd_assets_USD = base::round(stats::sd(assets_USD, na.rm = TRUE), 1),
mean_debts_USD = base::round(base::mean(debts_USD, na.rm = TRUE), 1),
sd_debts_USD = base::round(stats::sd(debts_USD, na.rm = TRUE), 1),
mean_income_USD =
base::round(base::mean(income_annual_24_gross_USD, na.rm = TRUE), 1),
sd_income_USD =
base::round(stats::sd(income_annual_24_gross_USD, na.rm = TRUE), 1),
)
reactable::reactable(
country_avg,
pagination = FALSE,
searchable = TRUE,
striped = TRUE,
highlight = TRUE,
compact = TRUE,
class = "avg_tbl",
columnGroups = list(
colGroup(name = "Age", columns = c("mean_age","sd_age")),
colGroup(name = "MPWB sum", columns = c("mean_mpwb_sum","sd_mpwb_sum")),
colGroup(name = "PHQ-4 sum", columns = c("mean_phq4_sum","sd_phq4_sum")),
colGroup(name = "Life satisfaction", columns = c("mean_ls","sd_ls")),
colGroup(name = "Assets USD", columns = c("mean_assets_USD","sd_assets_USD")),
colGroup(name = "Debts USD", columns = c("mean_debts_USD","sd_debts_USD")),
colGroup(name = "Income USD", columns = c("mean_income_USD","sd_income_USD"))
),
defaultColDef = colDef(
headerVAlign = "bottom",
vAlign = "center",
headerStyle = list(fontSize = "12px"),
style = list(fontSize = "12px"),
filterable = FALSE
),
columns = list(
country = colDef(
name = "Country",
sticky = "left",
minWidth = 100
),
mean_age = colDef(name = "<i>M</i>", html = TRUE),
sd_age = colDef(name = "<i>SD</i>", html = TRUE),
mean_mpwb_sum = colDef(name = "<i>M</i>", html = TRUE),
sd_mpwb_sum = colDef(name = "<i>SD</i>", html = TRUE),
mean_phq4_sum = colDef(name = "<i>M</i>", html = TRUE),
sd_phq4_sum = colDef(name = "<i>SD</i>", html = TRUE),
mean_ls = colDef(name = "<i>M</i>", html = TRUE),
sd_ls = colDef(name = "<i>SD</i>", html = TRUE),
mean_assets_USD = colDef(name = "<i>M</i>", html = TRUE),
sd_assets_USD = colDef(name = "<i>SD</i>", html = TRUE),
mean_debts_USD = colDef(name = "<i>M</i>", html = TRUE),
sd_debts_USD = colDef(name = "<i>SD</i>", html = TRUE),
mean_income_USD = colDef(name = "<i>M</i>", html = TRUE),
sd_income_USD = colDef(name = "<i>SD</i>", html = TRUE)
)
)Show the code
# N total per country
df_n_total <- df_gmh |>
dplyr::group_by(country) |>
dplyr::summarise(
n_total = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
)
# age_group
df_age_group <- df_gmh |>
dplyr::group_by(country, age_group) |>
dplyr::summarise(
n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
) |>
tidyr::pivot_wider(
names_from = age_group,
values_from = n,
values_fill = 0
) |>
dplyr::select(country, `18-25`, `26-44`, `45-64`, `65-74`, `75+`)
# childhood_SES_cat
df_childhood_ses <- df_gmh |>
dplyr::group_by(country, childhood_SES_cat) |>
dplyr::summarise(
n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
) |>
tidyr::pivot_wider(
names_from = childhood_SES_cat,
values_from = n,
values_fill = 0
)
# citizenship_cat
df_citizenship <- df_gmh |>
dplyr::group_by(country, citizenship_cat) |>
dplyr::summarise(
n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
) |>
tidyr::pivot_wider(
names_from = citizenship_cat,
values_from = n,
values_fill = 0
)
# education_recoded_cat
df_education <- df_gmh |>
dplyr::filter(!base::is.na(education_recoded_cat)) |>
dplyr::group_by(country, education_recoded_cat) |>
dplyr::summarise(
n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
) |>
tidyr::pivot_wider(
names_from = education_recoded_cat,
values_from = n,
values_fill = 0
) |>
dplyr::select(
country,
`Less than secondary`,
Secondary,
Technical,
University,
Advanced
)
# employment_primary
df_employment <- df_gmh |>
dplyr::filter(!base::is.na(employment_primary)) |>
dplyr::group_by(country, employment_primary) |>
dplyr::summarise(
n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
) |>
tidyr::pivot_wider(
names_from = employment_primary,
values_from = n,
values_fill = 0
)
# sex_reviewed_cat
df_sex <- df_gmh |>
dplyr::filter(!base::is.na(sex_reviewed_cat)) |>
dplyr::group_by(country, sex_reviewed_cat) |>
dplyr::summarise(
n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
) |>
tidyr::pivot_wider(
names_from = sex_reviewed_cat,
values_from = n,
values_fill = 0
)
# household size
df_household_size <- df_gmh |>
dplyr::group_by(country, household_size_group) |>
dplyr::summarise(
n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
) |>
tidyr::pivot_wider(
names_from = household_size_group,
values_from = n,
values_fill = 0
)
# financial outlook
df_fin_outlook <- df_gmh |>
dplyr::filter(!base::is.na(fin_outlook_cat)) |>
dplyr::group_by(country, fin_outlook_cat) |>
dplyr::summarise(
n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
) |>
tidyr::pivot_wider(
names_from = fin_outlook_cat,
values_from = n,
values_fill = 0
)
# work_arrangement_cat
df_work_arrangement <- df_gmh |>
dplyr::filter(!base::is.na(work_arrangement_cat)) |>
dplyr::group_by(country, work_arrangement_cat) |>
dplyr::summarise(
n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
) |>
tidyr::pivot_wider(
names_from = work_arrangement_cat,
values_from = n,
values_fill = 0
)
# income_merg_cat
# remove rows with "Student non-working (Full or part-time)" in employment_primary
df_income <- df_gmh |>
dplyr::filter(
employment_primary != "Student non-working (Full or part-time)" &
!base::is.na(income_merg_cat)
) |>
dplyr::group_by(country, income_merg_cat) |>
dplyr::summarise(
n = base::round(base::sum(ps_weight, na.rm = TRUE), 0)
) |>
tidyr::pivot_wider(
names_from = income_merg_cat,
values_from = n,
values_fill = 0
) |>
dplyr::select(
country,
`No income`,
`Second decile`,
`Third decile`,
`Fourth decile`,
`Fifth decile`,
`Sixth decile`,
`Seventh decile`,
`Eighth decile`,
`Ninth decile`,
`Tenth decile`
)
# join everything by country
df_demo_countw <- df_n_total |>
dplyr::full_join(df_age_group, by = "country") |>
dplyr::full_join(df_childhood_ses, by = "country") |>
dplyr::full_join(df_citizenship, by = "country") |>
dplyr::full_join(df_education, by = "country") |>
dplyr::full_join(df_employment, by = "country") |>
dplyr::full_join(df_sex, by = "country") |>
dplyr::full_join(df_household_size, by = "country") |>
dplyr::full_join(df_fin_outlook, by = "country") |>
dplyr::full_join(df_work_arrangement, by = "country") |>
dplyr::full_join(df_income, by = "country")
reactable::reactable(
df_demo_countw,
pagination = FALSE,
height = 650,
width = 800,
defaultSorted = "country",
defaultSortOrder = "asc",
searchable = TRUE,
striped = TRUE,
compact = TRUE,
highlight = TRUE,
columnGroups = list(
colGroup(
name = "Age group",
columns = colnames(df_demo_countw)[3:7]
),
colGroup(
name = "Childhood SES",
columns = colnames(df_demo_countw)[8:12]
),
colGroup(
name = "Citizenship status",
columns = colnames(df_demo_countw)[13:17]
),
colGroup(
name = "Education level",
columns = colnames(df_demo_countw)[18:22]
),
colGroup(
name = "Employmnet status",
columns = colnames(df_demo_countw)[23:29]
),
colGroup(
name = "Sex",
columns = colnames(df_demo_countw)[30:32]
),
colGroup(
name = "Household size",
columns = colnames(df_demo_countw)[33:37]
),
colGroup(
name = "Financial outlook",
columns = colnames(df_demo_countw)[38:42]
),
colGroup(
name = "Work arrangement",
columns = colnames(df_demo_countw)[43:47]
),
colGroup(
name = "Income deciles (student non-working excluded)",
columns = colnames(df_demo_countw)[48:57]
)
),
defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
defaultColDef = reactable::colDef(
filterable = FALSE,
vAlign = "center",
headerVAlign = "bottom",
class = "cell",
headerClass = "header",
headerStyle = list(fontSize = "13px"),
style = list(fontSize = "13px")),
columns = list(
country = reactable::colDef(
name = "Country",
sticky = "left",
width = 100
),
n_total = reactable::colDef(
name = "<em>N</em><sub>total</sub>",
html = TRUE,
width = 50
)
)
)Show the code
# A tibble: 1 × 2
country Female
<chr> <dbl>
1 USA 1883
Show the code
# A tibble: 1 × 2
country Female
<chr> <dbl>
1 Chad 3
Show the code
Warning in rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses, : object 'df_demo_counts' not found
Warning in rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses, : object 'df_sponsorship' not found
Show the code
# N total per country
df_n_total <- df_gmh |>
dplyr::group_by(country) |>
dplyr::summarise(
n_total = base::round(base::sum(ps_weight, na.rm = TRUE), 1)
)
# age_group
df_age_group <- df_gmh |>
dplyr::group_by(country, age_group) |>
dplyr::summarise(
wtd_n = base::sum(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = age_group,
values_from = pct,
values_fill = 0
) |>
dplyr::select(country, `18-25`, `26-44`, `45-64`, `65-74`, `75+`)
# childhood_SES_cat (weighted %)
df_childhood_ses <- df_gmh |>
dplyr::group_by(country, childhood_SES_cat) |>
dplyr::summarise(
wtd_n = base::sum(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = childhood_SES_cat,
values_from = pct,
values_fill = 0
)
# citizenship_cat
df_citizenship <- df_gmh |>
dplyr::group_by(country, citizenship_cat) |>
dplyr::summarise(
wtd_n = base::sum(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = citizenship_cat,
values_from = pct,
values_fill = 0
)
# education_recoded_cat
df_education <- df_gmh |>
dplyr::filter(!is.na(education_recoded_cat)) |>
dplyr::group_by(country, education_recoded_cat) |>
dplyr::summarise(
wtd_n = base::sum(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = education_recoded_cat,
values_from = pct,
values_fill = 0
) |>
dplyr::select(
country,
`Less than secondary`,
Secondary,
Technical,
University,
Advanced
)
# employment_primary
df_employment <- df_gmh |>
dplyr::filter(!is.na(employment_primary)) |>
dplyr::group_by(country, employment_primary) |>
dplyr::summarise(
wtd_n = base::sum(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = employment_primary,
values_from = pct,
values_fill = 0
)
# sex_reviewed_cat
df_sex <- df_gmh |>
dplyr::group_by(country, sex_reviewed_cat) |>
dplyr::summarise(
wtd_n = base::sum(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = sex_reviewed_cat,
values_from = pct,
values_fill = 0
)
# household_size_group
df_household_size <- df_gmh |>
dplyr::group_by(country, household_size_group) |>
dplyr::summarise(
wtd_n = base::sum(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = household_size_group,
values_from = pct,
values_fill = 0
)
# fin_outlook_cat
df_fin_outlook <- df_gmh |>
dplyr::filter(!is.na(fin_outlook_cat)) |>
dplyr::group_by(country, fin_outlook_cat) |>
dplyr::summarise(
wtd_n = base::sum(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = fin_outlook_cat,
values_from = pct,
values_fill = 0
)
# work_arrangement_cat
df_work_arrangement <- df_gmh |>
dplyr::filter(!is.na(work_arrangement_cat)) |>
dplyr::group_by(country, work_arrangement_cat) |>
dplyr::summarise(
wtd_n = base::sum(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = work_arrangement_cat,
values_from = pct,
values_fill = 0
)
# income_merg_cat (student non-working excluded)
df_income <- df_gmh |>
dplyr::filter(
employment_primary != "Student non-working (Full or part-time)" &
!is.na(income_merg_cat)
) |>
dplyr::group_by(country, income_merg_cat) |>
dplyr::summarise(
wtd_n = base::sum(ps_weight, na.rm = TRUE)
) |>
dplyr::mutate(
pct = base::round(100 * wtd_n / base::sum(wtd_n), 1)
) |>
dplyr::ungroup() |>
tidyr::pivot_wider(
id_cols = country,
names_from = income_merg_cat,
values_from = pct,
values_fill = 0
) |>
dplyr::select(
country,
`No income`,
`Second decile`,
`Third decile`,
`Fourth decile`,
`Fifth decile`,
`Sixth decile`,
`Seventh decile`,
`Eighth decile`,
`Ninth decile`,
`Tenth decile`
)
# join everything by country
df_demo_pctw <- df_n_total |>
dplyr::full_join(df_age_group, by = "country") |>
dplyr::full_join(df_childhood_ses, by = "country") |>
dplyr::full_join(df_citizenship, by = "country") |>
dplyr::full_join(df_education, by = "country") |>
dplyr::full_join(df_employment, by = "country") |>
dplyr::full_join(df_sex, by = "country") |>
dplyr::full_join(df_household_size, by = "country") |>
dplyr::full_join(df_fin_outlook, by = "country") |>
dplyr::full_join(df_work_arrangement, by = "country") |>
dplyr::full_join(df_income, by = "country")
reactable::reactable(
df_demo_pctw,
pagination = FALSE,
height = 650,
width = 800,
defaultSorted = "country",
defaultSortOrder = "asc",
searchable = TRUE,
striped = TRUE,
compact = TRUE,
highlight = TRUE,
columnGroups = list(
colGroup(
name = "Age group",
columns = colnames(df_demo_pctw)[3:7]
),
colGroup(
name = "Childhood SES",
columns = colnames(df_demo_pctw)[8:12]
),
colGroup(
name = "Citizenship status",
columns = colnames(df_demo_pctw)[13:17]
),
colGroup(
name = "Education level",
columns = colnames(df_demo_pctw)[18:22]
),
colGroup(
name = "Employmnet status",
columns = colnames(df_demo_pctw)[23:29]
),
colGroup(
name = "Sex",
columns = colnames(df_demo_pctw)[30:32]
),
colGroup(
name = "Household size",
columns = colnames(df_demo_pctw)[33:37]
),
colGroup(
name = "Financial outlook",
columns = colnames(df_demo_pctw)[38:42]
),
colGroup(
name = "Work arrangement",
columns = colnames(df_demo_pctw)[43:47]
),
colGroup(
name = "Income deciles (student non-working excluded)",
columns = colnames(df_demo_pctw)[48:57]
)
),
defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
defaultColDef = reactable::colDef(
filterable = FALSE,
vAlign = "center",
headerVAlign = "bottom",
class = "cell",
headerClass = "header",
headerStyle = list(fontSize = "13px"),
style = list(fontSize = "13px")),
columns = list(
country = reactable::colDef(
name = "Country",
sticky = "left",
width = 100
),
n_total = reactable::colDef(
name = "<em>N</em><sub>total</sub>",
html = TRUE,
width = 50
)
)
)Show the code
# A tibble: 1 × 2
country Female
<chr> <dbl>
1 Hungary 88.6
Show the code
# A tibble: 1 × 2
country Female
<chr> <dbl>
1 Chad 22.6
Show the code
Warning in rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses, : object 'df_demo_counts' not found
Warning in rm(df_demo_counts, df_n_total, df_age_group, df_childhood_ses, : object 'df_sponsorship' not found
A6. Geographic Distribution
# Sanity check: Any missing lat/long values?
df_gmh |>
dplyr::filter(is.na(lat) | is.na(long)) |>
base::nrow()[1] 0
# Extract geographical data and remove Antarctica for more efficient plotting
world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf") |>
base::subset(name != "Antarctica")
# Create data frame with unweighted values
by_cty <- df_gmh |>
dplyr::group_by(country) |>
dplyr::summarise(
n = dplyr::n(),
pct_female = base::mean(sex_reviewed_cat == "Female", na.rm = TRUE) * 100,
pct_fulltime = base::mean(
employment_primary == "Employed/working full-time (25+ hours per week)",
na.rm = TRUE) * 100,
mean_age = base::mean(age, na.rm = TRUE),
median_dur_min = stats::median(duration_sec, na.rm = TRUE) / 60
) |>
arrange(desc(n))
total_row <- df_gmh |>
dplyr::summarise(
country = "Total",
n = dplyr::n(),
pct_female = base::mean(sex_reviewed_cat == "Female", na.rm = TRUE) * 100,
pct_fulltime = base::mean(
employment_primary == "Employed/working full-time (25+ hours per week)",
na.rm = TRUE) * 100,
mean_age = base::mean(age, na.rm = TRUE),
median_dur_min = stats::median(duration_sec, na.rm = TRUE) / 60
)
tbl_country <- dplyr::bind_rows(total_row, by_cty) |>
dplyr::mutate(
`% Female` = base::sprintf("%.1f%%", pct_female),
`% Full-time employed` = base::sprintf("%.1f%%", pct_fulltime),
`italic(M)[age]` = base::round(mean_age, 1),
`italic(Md)[duration]` = base::round(median_dur_min, 1)
) |>
dplyr::select(
Country = country,
`italic(n)` = n,
`% Female`,
`% Full-time employed`,
`italic(M)[age]`,
`italic(Md)[duration]`
)
# Table aesthetics
tt <- gridExtra::ttheme_minimal(
core = list(
fg_params = list(
hjust = 0,
x = 0.02,
fontsize = 12.5, fontfamily = "Inter",
col = "#051520"
),
padding = unit(c(7.5, 2.2), "pt")
),
colhead = list(
fg_params = list(
hjust = 0,
x = 0.02,
fontsize = 12.7,
fontface = "plain",
col = "#051520",
parse = TRUE
),
bg_params = list(fill = NA)
)
)
# Create table grob
tbl_grob <-
gridExtra::tableGrob(tbl_country,
rows = NULL,
theme = tt)
tbl_grob <- gtable::gtable_add_grob(
tbl_grob,
grobs = grid::segmentsGrob(
x0 = grid::unit(0, "npc"),
x1 = grid::unit(1, "npc"),
y0 = grid::unit(1, "npc"),
y1 = grid::unit(1, "npc")
),
t = 1, l = 1, r = ncol(tbl_grob)
)
tbl_grob <- gtable::gtable_add_grob(
tbl_grob,
grobs = grid::segmentsGrob(
x0 = grid::unit(0, "npc"),
x1 = grid::unit(1, "npc"),
y0 = grid::unit(0, "npc"),
y1 = grid::unit(0, "npc")
),
t = nrow(tbl_grob), l = 1, r = ncol(tbl_grob)
)
tbl_grob$widths[[2]] <- tbl_grob$widths[[2]] * 1.9
# Add label "B" to the table block
tables_block <- cowplot::ggdraw(ggplotify::as.ggplot(tbl_grob)) +
draw_label("B", x = 0.002, y = 0.992,
hjust = 0, vjust = 1, fontface = "bold",
size = 30, fontfamily = "Inter", color = "#051520")Warning: `aes_()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`
ℹ The deprecated feature was likely used in the ggplotify package.
Please report the issue at <https://github.com/GuangchuangYu/ggplotify/issues>.
# Plot aesthetics
map_theme <- ggplot2::theme_minimal(base_family = "Inter", base_size = 20) +
ggplot2::theme(
text = element_text(family = "Inter", colour = "#051520"),
panel.grid = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
panel.background = element_rect(fill = "transparent", colour = NA),
plot.background = element_rect(fill = "transparent", colour = NA))
p_world <- ggplot2::ggplot() +
ggplot2::geom_sf(data = world, linewidth = 0.2, fill = "#ebedf0", color = "#a3a9bd") +
ggplot2::geom_point(data = df_gmh, aes(long, lat), alpha = 0.45,
size = 0.3, color="#082444") +
ggplot2::coord_sf(expand = FALSE) +
ggplot2::labs(title = "A\n") +
map_theme +
ggplot2::theme(plot.title = element_text(hjust = 0.02, face = "bold", size = 30,
colour = "#051520", family = "Inter"))
# Zoom Southeast Asia
sea_xlim <- c(90, 135)
sea_ylim <- c(-15, 25)
# Zoom Europe
eu_xlim <- c(-25, 45)
eu_ylim <- c(34, 72)
# Zoom East Asia
ea_xlim <- c(110, 150)
ea_ylim <- c(20, 50)
zoom_plot <- function(xlim, ylim, title_txt) {
ggplot2::ggplot() +
ggplot2::geom_sf(data = world, linewidth = 0.2,
fill = "#ebedf0", color = "#a3a9bd") +
ggplot2::geom_point(
data = df_gmh |> dplyr::filter(long >= xlim[1], long <= xlim[2],
lat >= ylim[1], lat <= ylim[2]),
aes(long, lat), alpha = 0.55, size = 0.3, color="#082444"
) +
ggplot2::coord_sf(xlim = xlim, ylim = ylim, expand = FALSE) +
ggplot2::labs(title = title_txt) +
map_theme +
ggplot2::theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 22,
colour = "#051520", family = "Inter"),
plot.margin = margin(2, 2, 2, 2))
}
p_sea <- zoom_plot(sea_xlim, sea_ylim, "Southeast Asia")
p_eu <- zoom_plot(eu_xlim, eu_ylim, "Europe")
p_ea <- zoom_plot(ea_xlim, ea_ylim, "East Asia")
zooms_row <- cowplot::plot_grid(p_eu, p_sea, p_ea, nrow = 1, rel_widths = c(1, 1, 1))Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
df_int_map <- df_gmh |>
dplyr::filter(!is.na(long), !is.na(lat)) |>
dplyr::mutate(
label_txt = sprintf("Location: %s", loc_admin_1))
map_int <-
leaflet::leaflet(
df_int_map,
options = leaflet::leafletOptions(
zoomControl = TRUE,
preferCanvas = TRUE
)
) |>
leaflet::addProviderTiles("CartoDB.Positron") |>
leaflet::addCircleMarkers(
lng = ~long,
lat = ~lat,
radius = 3,
stroke = FALSE,
fillOpacity = 0.5,
fillColor = "#082444",
color = "#082444",
label = ~label_txt,
labelOptions = leaflet::labelOptions(
noHide = FALSE,
direction = "auto",
sticky = TRUE,
opacity = 0.9,
style = list("font-size" = "12px")
)
) |>
leaflet.extras::addSearchOSM(
options = leaflet.extras::searchOptions(initial = FALSE)
) |>
leaflet.extras::addResetMapButton()
map_intA7. Comparison of Sponsored vs. Non-sponsored participants
# Sanity check: View the counts of sponsored participants per country
df_gmh |>
dplyr::filter(sponsored == 1) |>
dplyr::group_by(country) |>
dplyr::summarise(n = dplyr::n()) |>
print_reactable(sorted_col = "country", width = 500)# Sanity check: Compare mean values of mpwb by sponsored status
df_gmh |>
dplyr::group_by(country) |>
dplyr::mutate(
n_sponsored = base::sum(sponsored == 1, na.rm = TRUE)
) |>
# Only include countries with at least 30 sponsored participants
dplyr::filter(n_sponsored >= 30) |>
dplyr::group_by(country, sponsored) |>
dplyr::summarise(
n = dplyr::n(),
mean_mpwb_sum = base::mean(mpwb_sum, na.rm = TRUE),
.groups = "drop"
) |>
tidyr::pivot_wider(
names_from = sponsored,
values_from = c(n, mean_mpwb_sum),
names_prefix = "sponsored_"
) |>
dplyr::mutate(
mean_diff_abs = abs(mean_mpwb_sum_sponsored_1 - mean_mpwb_sum_sponsored_0)
) |>
dplyr::arrange(-mean_diff_abs) |>
print_reactable(sorted_col = "mean_diff_abs", width = 800)# Fit model with random intercepts and slopes for country
df_filtered <- df_gmh |>
dplyr::group_by(country) |>
dplyr::mutate(
n_sponsored = base::sum(sponsored == 1, na.rm = TRUE)
) |>
# Only include countries with at least 30 sponsored participants
dplyr::filter(n_sponsored >= 30)
model_sponsored <- lme4::lmer(
mpwb_sum ~ sponsored + (1 + sponsored | country),
data = df_filtered
)
summary(model_sponsored)Linear mixed model fit by REML ['lmerMod']
Formula: mpwb_sum ~ sponsored + (1 + sponsored | country)
Data: df_filtered
REML criterion at convergence: 110450
Scaled residuals:
Min 1Q Median 3Q Max
-3.8922 -0.6108 0.0377 0.6602 2.5964
Random effects:
Groups Name Variance Std.Dev. Corr
country (Intercept) 7.80 2.793
sponsored 10.57 3.251 -0.51
Residual 112.10 10.588
Number of obs: 14602, groups: country, 18
Fixed effects:
Estimate Std. Error t value
(Intercept) 47.4337 0.6890 68.846
sponsored 1.2853 0.8154 1.576
Correlation of Fixed Effects:
(Intr)
sponsored -0.530
Analysis of Deviance Table (Type II Wald chisquare tests)
Response: mpwb_sum
Chisq Df Pr(>Chisq)
sponsored 2.4848 1 0.1149
A8. Household size distribution
Distribution of household size groups
# Estimate weighted distribution of household size groups
df_hhsize <- df_gmh |>
survey::svydesign(
ids = ~1,
weights = ~ps_weight,
data = _,
nest = TRUE
)
# survey package allows to estimate CI more easily
est <- survey::svymean(~factor(household_size_group), df_hhsize, na.rm = TRUE)
ci <- stats::confint(est)
tibble::tibble(
household_size_group = sub("^factor\\(household_size_group\\)", "", names(est)),
percentage = round(100 * as.numeric(est),1),
ci_l = 100 * ci[, 1],
ci_u = 100 * ci[, 2]
) |>
dplyr::mutate(
household_size_group = trimws(household_size_group)
) |>
dplyr::arrange(household_size_group)# A tibble: 5 × 4
household_size_group percentage ci_l ci_u
<chr> <dbl> <dbl> <dbl>
1 1 27.2 26.7 27.7
2 2 26.6 26.1 27.1
3 3 16.1 15.7 16.5
4 4-5 22.8 22.4 23.3
5 6-20 7.2 6.97 7.52
Distribution of single households (living alone)
df_alone <- df_gmh |>
dplyr::mutate(
is_alone = base::as.integer(household_size == 1)
)
# Sanity check
table(df_alone$is_alone, useNA = "ifany")
0 1
39953 13846
df_alone_w <-
survey::svydesign(
ids = ~1,
weights = ~ps_weight,
data = df_alone,
nest = TRUE
)
overall_w <- function(des) {
est <- survey::svyciprop(~I(is_alone == 1), des, method = "logit", na.rm = TRUE)
ci <- stats::confint(est)
tibble::tibble(
pct_living_alone = round(100 * as.numeric(est), 1),
ci_l = round(100 * ci[1], 1),
ci_u = round(100 * ci[2], 1)
)
}
by_age_w <- function(des) {
survey::svyby(
~I(is_alone == 1),
~age_group,
des,
survey::svyciprop,
method = "logit",
vartype = c("ci"),
na.rm = TRUE,
keep.names = FALSE
) |>
dplyr::transmute(
age_group = age_group,
pct_living_alone = round(100 * `I(is_alone == 1)`, 1),
ci_l = round(100 * ci_l, 1),
ci_u = round(100 * ci_u, 1)
)
}
# Overall
global_overall <- df_alone_w |>
subset(!is.na(is_alone)) |>
overall_w() |>
dplyr::mutate(region = "Global")
eu_overall <- df_alone_w |>
subset(country %in% eu_countries) |>
overall_w() |>
dplyr::mutate(region = "EU")
uk_overall <- df_alone_w |>
subset(country == "UK") |>
overall_w() |>
dplyr::mutate(region = "UK")
usa_overall <- df_alone_w |>
subset(country == "USA") |>
overall_w() |>
dplyr::mutate(region = "USA")
dplyr::bind_rows(global_overall, eu_overall, uk_overall, usa_overall) |>
dplyr::relocate(region, pct_living_alone, ci_l, ci_u) |>
dplyr::arrange(region)# A tibble: 4 × 4
region pct_living_alone ci_l ci_u
<chr> <dbl> <dbl> <dbl>
1 EU 34.7 33.6 35.8
2 Global 27.2 26.7 27.7
3 UK 31 26.3 36.1
4 USA 32.4 30.7 34
# By age
global_age <- df_alone_w |>
by_age_w() |>
dplyr::mutate(region = "Global")
eu_age <- df_alone_w |>
subset(country %in% eu_countries) |>
by_age_w() |>
dplyr::mutate(region = "EU")
uk_age <- df_alone_w |>
subset(country == "UK") |>
by_age_w() |>
dplyr::mutate(region = "UK")
usa_age <- df_alone_w |>
subset(country == "USA") |>
by_age_w() |>
dplyr::mutate(region = "USA")
living_alone_by_age <-
dplyr::bind_rows(global_age, eu_age, uk_age, usa_age) |>
dplyr::relocate(region, age_group, pct_living_alone, ci_l, ci_u) |>
dplyr::arrange(region, age_group)
living_alone_by_age |>
dplyr::select(region, age_group, pct_living_alone) |>
tidyr::pivot_wider(names_from = age_group, values_from = pct_living_alone) |>
dplyr::arrange(region)# A tibble: 4 × 6
region `18-25` `26-44` `45-64` `65-74` `75+`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 EU 38.4 37 28 36.7 51.8
2 Global 28.8 27.3 23.2 33.2 44.4
3 UK 32.3 36.5 22.6 35.3 30.3
4 USA 44.9 32.9 26 26.6 34.3
A10. Order effects on MPWB items
Assessment if randomizing the order of MPWB items removed any confound effect of order. For this, we ignored the specific items, and computed means for the first presented item, second presented item, …, up to the tenth presented item (since there are 10 items in total).
# Identify items in CoreMPWB_DO
map_q_to_col <- c(
"Q5" = "mpwb_competence",
"Q7" = "mpwb_emotional_stability",
"Q9" = "mpwb_engagement",
"Q11" = "mpwb_meaning",
"Q13" = "mpwb_optimism",
"Q15" = "mpwb_positive_emotion",
"Q17" = "mpwb_positive_relationships",
"Q19" = "mpwb_resilience",
"Q21" = "mpwb_self_esteem",
"Q23" = "mpwb_vitality"
)
# determine presentation order from CoreMPWB_DO into long rows with position
order_long <- df_gmh |>
# We only have order data for non-IRL sponsored participants
dplyr::filter(irl == 0) |>
dplyr::select(ResponseId, CoreMPWB_DO) |>
dplyr::mutate(q_tokens = stringr::str_split(CoreMPWB_DO, "\\|")) |>
tidyr::unnest(q_tokens) |>
dplyr::mutate(q_tokens = stringr::str_trim(q_tokens)) |>
dplyr::filter(q_tokens %in% names(map_q_to_col)) |>
dplyr::mutate(mpwb_col = unname(map_q_to_col[q_tokens])) |>
dplyr::group_by(ResponseId) |>
dplyr::mutate(order_pos = dplyr::row_number()) |>
dplyr::select(ResponseId, mpwb_col, order_pos)
resp_long <- df_gmh |>
dplyr::select(ResponseId, dplyr::all_of(mpwb_items)) |>
tidyr::pivot_longer(
cols = dplyr::all_of(mpwb_items),
names_to = "mpwb_col",
values_to = "response"
)
by_order <- resp_long |>
dplyr::inner_join(order_long, by = c("ResponseId", "mpwb_col"))
# summary by presentation position
order_summary <- by_order |>
dplyr::group_by(order_pos) |>
dplyr::summarise(
mean = base::mean(response, na.rm = TRUE),
sd = stats::sd(response, na.rm = TRUE),
n = base::sum(response),
se = sd / sqrt(pmax(n, 1)),
lo = mean - 1.96 * se,
hi = mean + 1.96 * se
) |>
dplyr::right_join(
tibble::tibble(order_pos = 1:length(map_q_to_col)), by = "order_pos") |>
dplyr::arrange(order_pos)
# linear model of response by position
df_order <- by_order |>
dplyr::select(response, position = order_pos)
model_order <- stats::lm(response ~ position, data = df_order)
summary(model_order)
Call:
stats::lm(formula = response ~ position, data = df_order)
Residuals:
Min 1Q Median 3Q Max
-3.7659 -0.7509 0.2541 1.2491 2.2792
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.7709285 0.0044504 1072.021 < 0.0000000000000002 ***
position -0.0050152 0.0007172 -6.992 0.00000000000271 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.494 on 525988 degrees of freedom
Multiple R-squared: 9.294e-05, Adjusted R-squared: 9.104e-05
F-statistic: 48.89 on 1 and 525988 DF, p-value: 0.000000000002708
eta.sq eta.sq.part
position "0.0001" "0.0001"
# plot presentation-order effect
ggplot2::ggplot(order_summary, ggplot2::aes(x = order_pos, y = mean)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.5, fill = "#abc7ff") +
ggplot2::geom_line(linewidth = 0.7, na.rm = TRUE, colour = "#11357f") +
ggplot2::geom_point(size = 1.4, na.rm = TRUE, colour = "#11357f") +
ggplot2::scale_x_continuous(breaks = 1:length(map_q_to_col)) +
ggplot2::scale_y_continuous(breaks = 4:7, limits = c(4, 7)) +
ggplot2::labs(x = "Presentation Order", y = "MPWB average") +
theme_gmh +
ggplot2::theme(
panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25))ggplot2::ggplot(order_summary, ggplot2::aes(x = order_pos, y = mean)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.5, fill = "#abc7ff") +
ggplot2::geom_line(linewidth = 0.7, na.rm = TRUE, colour = "#11357f") +
ggplot2::geom_point(size = 1.4, na.rm = TRUE, colour = "#11357f") +
ggplot2::scale_x_continuous(breaks = 1:length(map_q_to_col)) +
ggplot2::labs(x = "Presentation Order", y = "MPWB average") +
theme_gmh +
ggplot2::theme(
panel.grid.major.x = ggplot2::element_line(color = "#ddeded", linewidth = 0.25))A11. Relationship between % Female participants and MPWB sum by country
# Compute unweighted % female and mean MPWB sum per country
country_fem <- df_gmh |>
dplyr::filter(sex_reviewed_cat %in% c("Male", "Female")) |>
dplyr::group_by(country) |>
dplyr::summarise(
n_total = dplyr::n(),
n_female = base::sum(sex_reviewed_cat == "Female"),
pct_female = (n_female / n_total) * 100,
mean_mpwb = base::mean(mpwb_sum, na.rm = TRUE)
)
# Correlation
cor.test(country_fem$pct_female, country_fem$mean_mpwb, method = "pearson")
Pearson's product-moment correlation
data: country_fem$pct_female and country_fem$mean_mpwb
t = -0.88104, df = 90, p-value = 0.3806
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.2917633 0.1145144
sample estimates:
cor
-0.09247186
Call:
lm(formula = mean_mpwb ~ pct_female, data = country_fem)
Residuals:
Min 1Q Median 3Q Max
-6.1881 -2.0496 -0.0373 2.0868 8.8472
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 49.08313 1.55941 31.475 <0.0000000000000002 ***
pct_female -0.02219 0.02519 -0.881 0.381
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.225 on 90 degrees of freedom
Multiple R-squared: 0.008551, Adjusted R-squared: -0.002465
F-statistic: 0.7762 on 1 and 90 DF, p-value: 0.3806
# Plot
country_flags <- df_gmh |>
dplyr::group_by(country) |>
dplyr::summarise(
iso2 = tolower(dplyr::coalesce(dplyr::first(na.omit(iso2)), NA_character_)),
alpha_country =
dplyr::if_else(any(ps_weight_flag == 1, na.rm = TRUE), 0.5, 1, missing = 1)
)
country_fem <- country_fem |>
dplyr::left_join(country_flags, by = "country")
plot_fem <- ggplot2::ggplot(country_fem, ggplot2::aes(x = pct_female, y = mean_mpwb)) +
ggplot2::geom_point(ggplot2::aes(alpha = alpha_country),
shape = 21, colour = "#051520", size = 4) +
ggfx::with_shadow(
ggplot2::geom_point(ggplot2::aes(alpha = alpha_country), size = 4.2, alpha = 0.5, stroke = 0),
sigma = 2, colour = "gray60", x_offset = 1, y_offset = 1
) +
ggflags::geom_flag(ggplot2::aes(country = iso2), size = 3.5, na.rm = TRUE) +
ggplot2::labs(
subtitle = paste("Pearson's r =", round(cor_test$estimate, 3)),
x = "% Female participants",
y = "MPWB Sum"
) +
theme_gmh +
ggplot2::guides(alpha = "none") +
ggplot2::coord_flip()Error: object 'cor_test' not found
A12. Global rankings on MPWB and Life Satisfaction
Figure 2
# Basic definitions for the plot
lane_width <- 40
center_step <- 2 * lane_width
y_min_sum <- 29
y_max_sum <- 61
dy <- 0.5
overlay_alpha_on <- 0.3
# Define the labels for x-axis
mpwb_names <- c(
mpwb_positive_relationships = "Positive\nrelationships",
mpwb_meaning = "Meaning",
mpwb_competence = "Competence",
mpwb_engagement = "Engagement",
mpwb_self_esteem = "Self-esteem",
mpwb_positive_emotion = "Positive\nemotions",
mpwb_optimism = "Optimism",
mpwb_resilience = "Resilience",
mpwb_emotional_stability = "Emotional\nstability",
mpwb_vitality = "Vitality",
MPWB_sum = "MPWB sum\n",
Life_satisfaction = "Life\nsatisfaction"
)
# gradient palette on 10–70 reference (mpwb_sum)
pal10_70 <- scales::gradient_n_pal(
colours = c("#e74c3c", "white", "#2ecc71"),
values = scales::rescale(c(10, 40, 70), to = c(0, 1), from = c(10, 70))
)
# Build per-plot gradient rectangles on native y scale, colored by equivalent 10–70
make_bg <- function(lane_df,
y_min_native,
y_max_native,
dy_native,
native_to_10_70) {
y_breaks <- seq(y_min_native, y_max_native - dy_native, by = dy_native)
bg <- tidyr::expand_grid(x_id = lane_df$x_id, y0 = y_breaks) |>
dplyr::left_join(lane_df, by = "x_id") |>
dplyr::mutate(
xmin = x_pos - lane_width,
xmax = x_pos + lane_width,
ymin = y0,
ymax = y0 + dy_native,
y_mid = (ymin + ymax) / 2,
y10_70 = native_to_10_70(y_mid),
fill = pal10_70(scales::rescale(
y10_70, to = c(0, 1), from = c(10, 70)
))
)
bg
}
make_lane_geometry <- function(lane_df, y_min, y_max) {
list(
overlay = lane_df |>
dplyr::mutate(
xmin = x_pos - lane_width,
xmax = x_pos + lane_width,
ymin = y_min,
ymax = y_max,
overlay_alpha = if_else(x_id %% 2 == 1, overlay_alpha_on, 0)
),
edges = lane_df |>
dplyr::transmute(
x = x_pos + lane_width,
xend = x_pos + lane_width,
y = y_min,
yend = y_max
)
)
}
# Global weighted means for ordering and mean lines
# Item global weighted means
item_wmeans <- tibble::tibble(item = mpwb_items) |>
dplyr::mutate(
wmean = purrr::map_dbl(item, ~ Hmisc::wtd.mean(df_gmh[[.x]], weights = df_gmh$ps_weight, na.rm = TRUE))
) |>
dplyr::arrange(dplyr::desc(wmean))
items_ordered <- item_wmeans$item
nice_items_ordered <- unname(mpwb_names[items_ordered])
# Global means (for vertical lines)
mean_sum_global <- as.numeric(Hmisc::wtd.mean(df_gmh$mpwb_sum, weights = df_gmh$ps_weight, na.rm = TRUE))
mean_ls_global <- as.numeric(Hmisc::wtd.mean(df_gmh$life_satisfaction, weights = df_gmh$ps_weight, na.rm = TRUE))
means_items_global <- item_wmeans |>
dplyr::rename(measure = item, mean_native = wmean)
# EU-level means
df_eu <- df_gmh |>
dplyr::filter(country %in% eu_countries)
eu_items <- tibble::tibble(measure = mpwb_items) |>
dplyr::mutate(mean_val = purrr::map_dbl(measure, ~ Hmisc::wtd.mean(df_eu[[.x]], weights = df_eu$ps_weight, na.rm = TRUE))) |>
dplyr::mutate(country = "EU", iso2 = "eu", flagged = FALSE)
eu_sum <- tibble::tibble(
country = "EU",
MPWB_sum = as.numeric(Hmisc::wtd.mean(df_eu$mpwb_sum, weights = df_eu$ps_weight, na.rm = TRUE)),
iso2 = "eu",
flagged = FALSE
)
eu_ls <- tibble::tibble(
country = "EU",
Life_satisfaction = as.numeric(Hmisc::wtd.mean(df_eu$life_satisfaction, weights = df_eu$ps_weight, na.rm = TRUE)),
iso2 = "eu",
flagged = FALSE
)
# Flagged countries during weighting (see Section A0.4)
# Country-level weighted means for items
means_cty_items <- df_gmh |>
dplyr::group_by(country, iso2) |>
dplyr::mutate(iso2 = tolower(iso2)) |>
dplyr::summarise(
dplyr::across(
.cols = dplyr::all_of(mpwb_items),
.fns = ~ Hmisc::wtd.mean(.x, weights = ps_weight, na.rm = TRUE),
.names = "{.col}"
)
) |>
tidyr::pivot_longer(
cols = dplyr::all_of(mpwb_items),
names_to = "measure",
values_to = "mean_val"
) |>
dplyr::mutate(flagged = country %in% flagged_countries) |>
dplyr::bind_rows(eu_items)
# Country-level MPWB sum
means_cty_sum <- df_gmh |>
dplyr::group_by(country, iso2) |>
dplyr::summarise(
MPWB_sum = as.numeric(Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE))
) |>
dplyr::mutate(flagged = country %in% flagged_countries) |>
dplyr::bind_rows(eu_sum)
# Country-level Life satisfaction
means_cty_ls <- df_gmh |>
dplyr::group_by(country, iso2) |>
dplyr::summarise(
Life_satisfaction = as.numeric(Hmisc::wtd.mean(life_satisfaction, weights = ps_weight, na.rm = TRUE))
) |>
dplyr::mutate(flagged = country %in% flagged_countries) |>
dplyr::bind_rows(eu_ls)
# Lane positions
lane_items <- tibble::tibble(
x_id = seq_along(items_ordered),
label = unname(nice_items_ordered),
x_pos = (seq_along(items_ordered) - 1) * center_step
)
lane_sum <- tibble::tibble(x_id = 1L,
label = unname(mpwb_names["MPWB_sum"]),
x_pos = 0)
lane_ls <- tibble::tibble(x_id = 1L,
label = unname(mpwb_names["Life_satisfaction"]),
x_pos = 0)
# Vertical alignment
y_min_items <- 1 + (y_min_sum - 10) / 10
y_max_items <- 1 + (y_max_sum - 10) / 10
y_min_ls <- (y_min_sum - 10) / 6
y_max_ls <- (y_max_sum - 10) / 6
# background coloring
to10_70_items <- function(y)
10 + (y - 1) * 10
to10_70_sum <- function(y)
y
to10_70_ls <- function(y)
10 + y * 6
bg_items <- make_bg(
lane_items,
y_min_items,
y_max_items,
dy_native = dy / 10,
native_to_10_70 = to10_70_items
)
bg_sum <- make_bg(lane_sum,
y_min_sum,
y_max_sum,
dy_native = dy,
native_to_10_70 = to10_70_sum)
bg_ls <- make_bg(lane_ls,
y_min_ls,
y_max_ls,
dy_native = dy / 6,
native_to_10_70 = to10_70_ls)
geo_items <- make_lane_geometry(lane_items, y_min_items, y_max_items)
geo_sum <- make_lane_geometry(lane_sum, y_min_sum, y_max_sum)
geo_ls <- make_lane_geometry(lane_ls, y_min_ls, y_max_ls)
# Jittered points
set.seed(123)
points_items <- means_cty_items |>
dplyr::mutate(
x_id = match(measure, items_ordered),
x_pos = (x_id - 1) * center_step,
x_jit = x_pos + runif(n(), -0.7 * lane_width, 0.7 * lane_width),
y_val = mean_val,
iso2 = tolower(iso2)
)
# MPWB sum and Life Satisfaction points
points_sum <- means_cty_sum |>
dplyr::transmute(
x_jit = runif(dplyr::n(), -0.7 * lane_width, 0.7 * lane_width),
y_val = MPWB_sum,
iso2 = tolower(iso2)
) |>
dplyr::left_join(means_cty_sum |> dplyr::select(iso2, flagged), by = "iso2")Adding missing grouping variables: `country`
points_ls <- means_cty_ls |>
dplyr::transmute(
x_jit = runif(dplyr::n(), -0.7 * lane_width, 0.7 * lane_width),
y_val = Life_satisfaction,
iso2 = tolower(iso2)
) |>
dplyr::left_join(means_cty_ls |> dplyr::select(iso2, flagged), by = "iso2")Adding missing grouping variables: `country`
mean_lines_items <- means_items_global |>
dplyr::mutate(
x_id = match(measure, items_ordered),
x_pos = (x_id - 1) * center_step,
xmin = x_pos - lane_width,
xmax = x_pos + lane_width,
y = mean_native
)
mean_line_sum <- tibble::tibble(xmin = -lane_width,
xmax = lane_width,
y = mean_sum_global)
mean_line_ls <- tibble::tibble(xmin = -lane_width,
xmax = lane_width,
y = mean_ls_global)
# Panel MPWB items
p_items <- ggplot2::ggplot(points_items, ggplot2::aes(x = x_jit, y = y_val)) +
ggplot2::geom_rect(
data = bg_items,
ggplot2::aes(
xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
fill = fill
),
inherit.aes = FALSE,
color = NA
) +
ggplot2::scale_fill_identity() +
ggplot2::geom_rect(
data = geo_items$overlay,
ggplot2::aes(
xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
alpha = overlay_alpha
),
inherit.aes = FALSE,
fill = "white",
color = NA
) +
ggplot2::scale_alpha_identity() +
ggplot2::geom_segment(
data = geo_items$edges[-nrow(geo_items$edges), ],
ggplot2::aes(
x = x,
xend = xend,
y = y,
yend = yend
),
inherit.aes = FALSE,
color = "#b5bec9",
linewidth = 0.2
) +
ggplot2::geom_segment(
data = mean_lines_items,
ggplot2::aes(
x = xmin,
xend = xmax,
y = y,
yend = y
),
inherit.aes = FALSE,
color = "#9cacbc",
linewidth = 0.4,
linetype = "solid"
) +
ggplot2::geom_point(
shape = 21,
size = 3.2,
color = "#0B2E55",
na.rm = TRUE
) +
ggflags::geom_flag(ggplot2::aes(country = iso2),
size = 3,
na.rm = TRUE) +
ggplot2::scale_x_continuous(
limits = c(
min(lane_items$x_pos - lane_width),
max(lane_items$x_pos + lane_width)
),
breaks = lane_items$x_pos,
labels = lane_items$label,
expand = c(0, 0)
) +
ggplot2::scale_y_continuous(breaks = pretty(c(y_min_items, y_max_items), n = 7), expand = c(0, 0)) +
ggplot2::labs(x = NULL, y = "Scores\n") +
ggplot2::coord_cartesian(ylim = c(y_min_items, y_max_items), clip = "off") +
theme_gmh
# Panel MPWB sum
p_sum <- ggplot2::ggplot(points_sum, ggplot2::aes(x = x_jit, y = y_val)) +
ggplot2::geom_rect(
data = bg_sum,
ggplot2::aes(
xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
fill = fill
),
inherit.aes = FALSE,
color = NA
) +
ggplot2::scale_fill_identity() +
ggplot2::geom_rect(
data = geo_sum$overlay,
ggplot2::aes(
xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
alpha = overlay_alpha
),
inherit.aes = FALSE,
fill = "white",
color = NA
) +
ggplot2::scale_alpha_identity() +
ggplot2::geom_segment(
data = geo_sum$edges[-nrow(geo_sum$edges), ],
ggplot2::aes(
x = x,
xend = xend,
y = y,
yend = yend
),
inherit.aes = FALSE,
color = "#b5bec9",
linewidth = 0.2
) +
ggplot2::geom_segment(
data = mean_line_sum,
ggplot2::aes(
x = xmin,
xend = xmax,
y = y,
yend = y
),
inherit.aes = FALSE,
color = "#9cacbc",
linewidth = 0.4,
linetype = "solid"
) +
ggplot2::geom_point(
shape = 21,
size = 3.2,
color = "#0B2E55",
na.rm = TRUE
) +
ggplot2::geom_point(
data = points_sum |> dplyr::filter(flagged == TRUE),
shape = 21,
size = 3.2,
color = "#b5bec9",
na.rm = TRUE
) +
ggflags::geom_flag(ggplot2::aes(country = iso2),
size = 3,
na.rm = TRUE) +
ggplot2::scale_x_continuous(
limits = c(-lane_width, lane_width),
breaks = 0,
labels = mpwb_names["MPWB_sum"],
expand = c(0, 0)
) +
ggplot2::scale_y_continuous(breaks = seq(30, 60, by = 5), expand = c(0, 0)) +
ggplot2::labs(x = NULL, y = NULL) +
ggplot2::coord_cartesian(ylim = c(y_min_sum, y_max_sum), clip = "off") +
theme_gmh
# Panel Life satisfaction
p_ls <- ggplot2::ggplot(points_ls, ggplot2::aes(x = x_jit, y = y_val)) +
ggplot2::geom_rect(
data = bg_ls,
ggplot2::aes(
xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
fill = fill
),
inherit.aes = FALSE,
color = NA
) +
ggplot2::scale_fill_identity() +
ggplot2::geom_rect(
data = geo_ls$overlay,
ggplot2::aes(
xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
alpha = overlay_alpha
),
inherit.aes = FALSE,
fill = "white",
color = NA
) +
ggplot2::scale_alpha_identity() +
ggplot2::geom_segment(
data = geo_ls$edges[-nrow(geo_ls$edges), ],
ggplot2::aes(
x = x,
xend = xend,
y = y,
yend = yend
),
inherit.aes = FALSE,
color = "#b5bec9",
linewidth = 0.2
) +
ggplot2::geom_segment(
data = mean_line_ls,
ggplot2::aes(
x = xmin,
xend = xmax,
y = y,
yend = y
),
inherit.aes = FALSE,
color = "#9cacbc",
linewidth = 0.4,
linetype = "solid"
) +
ggplot2::geom_point(
shape = 21,
size = 3.2,
color = "#0B2E55",
na.rm = TRUE
) +
ggplot2::geom_point(
data = points_ls |> dplyr::filter(flagged == TRUE),
shape = 21,
size = 3.2,
color = "#b5bec9",
na.rm = TRUE
) +
ggflags::geom_flag(ggplot2::aes(country = iso2),
size = 3,
na.rm = TRUE) +
ggplot2::scale_x_continuous(
limits = c(-lane_width, lane_width),
breaks = 0,
labels = mpwb_names["Life_satisfaction"],
expand = c(0, 0)
) +
ggplot2::scale_y_continuous(
limits = c(y_min_ls, y_max_ls),
breaks = c(3.33, 4.17, 5.00, 5.83, 6.67, 7.50, 8.33),
expand = c(0, 0)
) +
ggplot2::labs(x = NULL, y = NULL) +
ggplot2::coord_cartesian(ylim = c(y_min_ls, y_max_ls), clip = "off")Figure 3
dnk_nor <-
points_items |>
dplyr::filter(country == "Denmark"| country == "Norway")
p_fig3 <- ggplot2::ggplot(dnk_nor, ggplot2::aes(x = x_jit, y = y_val)) +
ggplot2::geom_rect(
data = bg_items,
ggplot2::aes(
xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
fill = fill
),
inherit.aes = FALSE,
color = NA
) +
ggplot2::scale_fill_identity() +
ggplot2::geom_rect(
data = geo_items$overlay,
ggplot2::aes(
xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
alpha = overlay_alpha
),
inherit.aes = FALSE,
fill = "white",
color = NA
) +
ggplot2::scale_alpha_identity() +
ggplot2::geom_segment(
data = geo_items$edges[-nrow(geo_items$edges), ],
ggplot2::aes(
x = x,
xend = xend,
y = y,
yend = yend
),
inherit.aes = FALSE,
color = "#b5bec9",
linewidth = 0.2
) +
ggplot2::geom_segment(
data = mean_lines_items,
ggplot2::aes(
x = xmin,
xend = xmax,
y = y,
yend = y
),
inherit.aes = FALSE,
color = "#9cacbc",
linewidth = 0.4,
linetype = "solid"
) +
ggplot2::geom_point(
shape = 21,
size = 5.2,
color = "#0B2E55",
na.rm = TRUE
) +
ggflags::geom_flag(ggplot2::aes(country = iso2),
size = 5,
na.rm = TRUE) +
ggplot2::scale_x_continuous(
limits = c(
min(lane_items$x_pos - lane_width),
max(lane_items$x_pos + lane_width)
),
breaks = lane_items$x_pos,
labels = lane_items$label,
expand = c(0, 0)
) +
ggplot2::scale_y_continuous(breaks = pretty(c(y_min_items, y_max_items), n = 7), expand = c(0, 0)) +
ggplot2::labs(x = NULL, y = "Scores\n") +
ggplot2::coord_cartesian(ylim = c(y_min_items, y_max_items), clip = "off") +
theme_gmhLife Satisfaction Distribution
# Weighted mean life satisfaction by country
country_stats <- df_gmh |>
dplyr::group_by(country) |>
dplyr::summarise(
mean_ls = Hmisc::wtd.mean(
x = life_satisfaction,
weights = ps_weight,
na.rm = TRUE
)
)
# Weighted distribution
dist_ls <- df_gmh |>
dplyr::group_by(country, life_satisfaction) |>
dplyr::summarise(
w_n = sum(ps_weight, na.rm = TRUE),
.groups = "drop"
) |>
tidyr::complete(
country,
life_satisfaction = 0:10,
fill = list(w_n = 0)
) |>
dplyr::group_by(country) |>
dplyr::mutate(
w_total = sum(w_n),
pct = ifelse(w_total > 0, 100 * w_n / w_total, 0),
contrib = life_satisfaction * pct / 100
) |>
dplyr::ungroup() |>
dplyr::left_join(country_stats, by = "country") |>
dplyr::mutate(
country = forcats::fct_reorder(country, mean_ls),
life_satisfaction = factor(life_satisfaction, levels = 0:10)
)
# color palette
ls_palette <- grDevices::colorRampPalette(
c("#020C14","#0047AB","#7EA1D6","#D5ABF7", "#FFFFCC",
"#B6E3B6", "#00A36C","#13948B", "#054732")
)(10)
# interactive stacked bar
p_ls <- dist_ls |>
dplyr::arrange(life_satisfaction) |>
plotly::plot_ly(
x = ~contrib,
y = ~country,
type = "bar",
orientation = "h",
color = ~life_satisfaction,
colors = ls_palette,
text = ~paste0(country, "<br>",
"Life satisfaction Level: ", life_satisfaction, ",",
sprintf("%.1f", pct), "%<br>"
),
hoverinfo = "text"
) |>
plotly::layout(
barmode = "stack",
yaxis = list(
title = ""
),
xaxis = list(
title = "Life satisfaction",
range = c(0, 10)
),
legend = list(
orientation = "h",
title = list(text = "Life satisfaction"),
y = 1.05,
x = 0
)
)
# add mean labels above bars
ann_df <- country_stats |>
dplyr::mutate(
country = forcats::fct_reorder(country, mean_ls),
mean_label = sprintf("%.2f", mean_ls)
)
p_ls <- p_ls |>
plotly::layout(
annotations = lapply(seq_len(nrow(ann_df)), function(i) {
list(
y = ann_df$country[i],
x = ann_df$mean_ls[i] + 0.1,
text = ann_df$mean_label[i],
showarrow = FALSE,
yanchor = "left",
font = list(size = 8)
)
})
)
p_lsCorrelation between Life satisfaction and MPWB items
# Unweighted Pearson's r
lapply(mpwb_items, function(i) {
test <- stats::cor.test(df_gmh[[i]], df_gmh$life_satisfaction, method = "pearson")
tibble(
dimension = i,
r = unname(test$estimate),
p = test$p.value
)
})|>
dplyr::bind_rows()# A tibble: 10 × 3
dimension r p
<chr> <dbl> <dbl>
1 mpwb_positive_relationships 0.425 0
2 mpwb_meaning 0.569 0
3 mpwb_competence 0.553 0
4 mpwb_engagement 0.408 0
5 mpwb_self_esteem 0.604 0
6 mpwb_optimism 0.611 0
7 mpwb_positive_emotion 0.703 0
8 mpwb_emotional_stability 0.600 0
9 mpwb_resilience 0.497 0
10 mpwb_vitality 0.600 0
item r t p
1 mpwb_positive_relationships 0.427 98.43 <.001
2 mpwb_meaning 0.575 161.30 <.001
3 mpwb_competence 0.554 155.62 <.001
4 mpwb_engagement 0.413 102.28 <.001
5 mpwb_self_esteem 0.611 183.50 <.001
6 mpwb_optimism 0.613 183.36 <.001
7 mpwb_positive_emotion 0.709 244.83 <.001
8 mpwb_emotional_stability 0.597 177.76 <.001
9 mpwb_resilience 0.500 133.30 <.001
10 mpwb_vitality 0.598 189.27 <.001
Correlation between MPWB items and MPWB Sum
item r t p
1 mpwb_positive_relationships 0.581 176.19 <.001
2 mpwb_meaning 0.786 394.33 <.001
3 mpwb_competence 0.770 359.18 <.001
4 mpwb_engagement 0.636 198.18 <.001
5 mpwb_self_esteem 0.813 439.34 <.001
6 mpwb_optimism 0.786 413.00 <.001
7 mpwb_positive_emotion 0.829 500.51 <.001
8 mpwb_emotional_stability 0.768 379.80 <.001
9 mpwb_resilience 0.700 266.17 <.001
10 mpwb_vitality 0.784 429.53 <.001
# Leave-one-out correlations
purrr::map_dfr(mpwb_items, function(item) {
x_vec <- df_gmh[[item]]
df_tmp <- df_gmh |> dplyr::mutate(loo_total = mpwb_sum - x_vec)
res <- rlang::eval_tidy(
rlang::call2("weighted_corr", df_tmp, rlang::sym(item), rlang::sym("loo_total"))
)
tibble::tibble(
dimension = item,
r = res[[1]],
t = res[[2]],
p = res[[3]]
)
})# A tibble: 10 × 4
dimension r t p
<chr> <chr> <chr> <chr>
1 mpwb_positive_relationships 0.483 132.04 <.001
2 mpwb_meaning 0.726 288.87 <.001
3 mpwb_competence 0.708 280.15 <.001
4 mpwb_engagement 0.555 148.43 <.001
5 mpwb_self_esteem 0.759 318.11 <.001
6 mpwb_optimism 0.721 294.42 <.001
7 mpwb_positive_emotion 0.780 383.33 <.001
8 mpwb_emotional_stability 0.702 265.15 <.001
9 mpwb_resilience 0.621 198.72 <.001
10 mpwb_vitality 0.720 299.57 <.001
Correlation between Ranking of Life Satisfaction and MPWB Sum by Country
# Country estimates
means_cty_rank <-
df_gmh |>
dplyr::group_by(country, iso2) |>
dplyr::summarise(
mpwb_mean = base::mean(mpwb_sum, na.rm = TRUE),
ls_mean = base::mean(life_satisfaction, na.rm = TRUE),
mpwb_mean_w = base::sum(mpwb_sum * ps_weight, na.rm = TRUE) /
base::sum(ps_weight[!base::is.na(mpwb_sum)], na.rm = TRUE),
ls_mean_w = base::sum(life_satisfaction * ps_weight, na.rm = TRUE) /
base::sum(ps_weight[!base::is.na(life_satisfaction)], na.rm = TRUE),
.groups = "drop"
) |>
dplyr::mutate(
iso2 = base::tolower(iso2),
alpha_country = dplyr::if_else(country %in% flagged_countries, 0.5, 1),
rank_ls = dplyr::min_rank(dplyr::desc(ls_mean)),
rank_mpwb = dplyr::min_rank(dplyr::desc(mpwb_mean)),
rank_ls_w = dplyr::min_rank(dplyr::desc(ls_mean_w)),
rank_mpwb_w = dplyr::min_rank(dplyr::desc(mpwb_mean_w))
)
means_cty_rank |>
print_reactable(sorted_col = "rank_ls_w", width = 800)# Correlations unweighted
rho_rank_unw <- stats::cor(
means_cty_rank$rank_ls,
means_cty_rank$rank_mpwb,
method = "pearson"
)
tau_rank_unw <- stats::cor(
means_cty_rank$rank_ls,
means_cty_rank$rank_mpwb,
method = "kendall"
)
r_mean_p_unw <- stats::cor(
means_cty_rank$ls_mean,
means_cty_rank$mpwb_mean,
method = "pearson"
)
r_mean_s_unw <- stats::cor(
means_cty_rank$ls_mean,
means_cty_rank$mpwb_mean,
method = "spearman"
)
# Correlations weighted
rho_rank_w <- stats::cor(
means_cty_rank$rank_ls_w,
means_cty_rank$rank_mpwb_w,
method = "pearson"
)
tau_rank_w <- stats::cor(
means_cty_rank$rank_ls_w,
means_cty_rank$rank_mpwb_w,
method = "kendall"
)
r_mean_p_w <- stats::cor(
means_cty_rank$ls_mean_w,
means_cty_rank$mpwb_mean_w,
method = "pearson"
)
r_mean_s_w <- stats::cor(
means_cty_rank$ls_mean_w,
means_cty_rank$mpwb_mean_w,
method = "spearman"
)
# Summary
tibble::tibble(
weighting = c("unweighted", "weighted"),
rho_rank = c(rho_rank_unw, rho_rank_w),
tau_rank = c(tau_rank_unw, tau_rank_w),
r_mean_p = c(r_mean_p_unw, r_mean_p_w),
r_mean_s = c(r_mean_s_unw, r_mean_s_w)
) |>
dplyr::mutate(
dplyr::across(
dplyr::where(base::is.numeric),
~ base::round(.x, 3)
)
)# A tibble: 2 × 5
weighting rho_rank tau_rank r_mean_p r_mean_s
<chr> <dbl> <dbl> <dbl> <dbl>
1 unweighted 0.832 0.656 0.842 0.832
2 weighted 0.853 0.683 0.853 0.853
Show the code
# Weighted Ranks (1 = highest)
ggplot2::ggplot(means_cty_rank,
ggplot2::aes(x = rank_ls_w, y = rank_mpwb_w)) +
ggplot2::geom_smooth(
method = "lm",
se = FALSE,
color = "#6F7C91",
linewidth = 0.8
) +
ggplot2::labs(
subtitle = bquote("Pearson's r" == .(sprintf("%.3f", rho_rank_w))),
x = "Life Satisfaction Rank",
y = "MPWB Sum Rank"
) +
ggplot2::geom_point(
ggplot2::aes(alpha = alpha_country),
shape = 21,
colour = "#051520",
size = 3.8
) +
ggfx::with_shadow(
ggplot2::geom_point(
ggplot2::aes(alpha = alpha_country),
size = 4.2,
alpha = 0.5,
stroke = 0
),
sigma = 2,
colour = "gray60",
x_offset = 1,
y_offset = 1
) +
ggflags::geom_flag(
ggplot2::aes(country = iso2),
size = 3.5,
na.rm = TRUE
) +
ggplot2::theme(
legend.position = "none",
panel.grid.major.y = ggplot2::element_line(color = "#ECF3F3", linewidth = 0.4),
axis.line.x = element_blank()
) +
ggplot2::guides(alpha = "none") +
ggplot2::coord_flip() +
ggplot2::scale_x_reverse() +
ggplot2::scale_y_reverse()Show the code
# Weighted Means
ggplot2::ggplot(means_cty_rank, ggplot2::aes(x = ls_mean_w, y = mpwb_mean_w)) +
ggplot2::geom_smooth(
method = "lm",
se = FALSE,
color = "#6F7C91",
linewidth = 0.8
) +
ggplot2::labs(
subtitle = bquote("Pearson's r" == .(sprintf("%.2f", r_mean_p_w))),
x = "Life Satisfaction",
y = "MPWB Sum"
) +
ggplot2::geom_point(
ggplot2::aes(alpha = alpha_country),
shape = 21,
colour = "#051520",
size = 3.8
) +
ggfx::with_shadow(
ggplot2::geom_point(
ggplot2::aes(alpha = alpha_country),
size = 4.2,
alpha = 0.5,
stroke = 0
),
sigma = 2,
colour = "gray60",
x_offset = 1,
y_offset = 1
) +
ggflags::geom_flag(
ggplot2::aes(country = iso2),
size = 3.5,
na.rm = TRUE
) +
ggplot2::theme(
legend.position = "none",
panel.grid.major.y = ggplot2::element_line(color = "#ECF3F3", linewidth = 0.4),
axis.line.x = element_blank()
) +
ggplot2::guides(alpha = "none") +
ggplot2::coord_flip()# Compare with ladder item from World Happiness Report
whr <- tribble(
~iso2, ~ladder_score,
"FI", 7.736,
"DK", 7.521,
"SE", 7.345,
"NL", 7.306,
"NO", 7.262,
"IL", 7.234,
"MX", 6.979,
"AU", 6.974,
"CH", 6.935,
"BE", 6.91,
"IE", 6.889,
"AT", 6.81,
"CA", 6.803,
"SI", 6.792,
"CZ", 6.775,
"AE", 6.759,
"DE", 6.753,
"GB", 6.728,
"US", 6.724,
"PL", 6.673,
"TW", 6.669,
"UY", 6.661,
"XK", 6.659,
"KW", 6.629,
"RS", 6.606,
"SA", 6.6,
"FR", 6.593,
"SG", 6.565,
"RO", 6.563,
"BR", 6.494,
"ES", 6.466,
"EE", 6.417,
"IT", 6.415,
"AR", 6.397,
"KZ", 6.378,
"CL", 6.361,
"TH", 6.222,
"SK", 6.221,
"LV", 6.207,
"OM", 6.197,
"UZ", 6.193,
"JP", 6.147,
"BA", 6.136,
"PH", 6.107,
"KR", 6.038,
"BH", 6.03,
"PT", 6.013,
"EC", 5.965,
"MY", 5.955,
"PE", 5.947,
"RU", 5.945,
"CY", 5.942,
"CN", 5.921,
"HU", 5.915,
"ME", 5.877,
"HR", 5.87,
"BO", 5.868,
"KG", 5.858,
"MN", 5.833,
"MD", 5.819,
"GR", 5.776,
"ID", 5.617,
"DZ", 5.571,
"BG", 5.554,
"MK", 5.503,
"AM", 5.494,
"HK", 5.491,
"AL", 5.411,
"GE", 5.4,
"TR", 5.262,
"ZA", 5.213,
"MZ", 5.19,
"IR", 5.093,
"NG", 4.885,
"SN", 4.856,
"PK", 4.768,
"UA", 4.68,
"MA", 4.622,
"UG", 4.461,
"IN", 4.389,
"TD", 4.384,
"MG", 4.157,
"ZM", 3.912,
"ET", 3.898,
"BD", 3.851,
"EG", 3.817,
"YE", 3.561,
"ZW", 3.396,
"LB", 3.188
) |>
dplyr::mutate(
iso2 = base::tolower(iso2)
)
means_cty_rank_whr <- means_cty_rank |>
dplyr::left_join(whr, by = c("iso2")) |>
dplyr::mutate(
rank_whr = dplyr::min_rank(dplyr::desc(ladder_score))
) |>
dplyr::filter(!is.na(ladder_score))
stats::cor(
means_cty_rank_whr$rank_ls_w,
means_cty_rank_whr$rank_whr,
method = "pearson"
) |> round(3)[1] 0.032
stats::cor(
means_cty_rank_whr$rank_ls_w,
means_cty_rank_whr$rank_whr,
method = "kendall"
) |> round(3)[1] 0.022
stats::cor(
means_cty_rank_whr$ls_mean_w,
means_cty_rank_whr$ladder_score,
method = "pearson"
) |> round(3)[1] 0.129
stats::cor(
means_cty_rank_whr$ls_mean_w,
means_cty_rank_whr$ladder_score,
method = "spearman"
) |> round(3)[1] 0.031
# Plot
ggplot2::ggplot(means_cty_rank_whr,
ggplot2::aes(x = rank_ls_w, y = rank_whr)) +
ggplot2::labs(
subtitle = bquote("Pearson's r" == .(sprintf("%.3f", rho_rank_w_whr))),
x = "Life Satisfaction Rank",
y = "Ladder Rank (World Happiness Report)"
) +
ggplot2::geom_point(
ggplot2::aes(alpha = alpha_country),
shape = 21,
colour = "#051520",
size = 3.8
) +
ggfx::with_shadow(
ggplot2::geom_point(
ggplot2::aes(alpha = alpha_country),
size = 4.2,
alpha = 0.5,
stroke = 0
),
sigma = 2,
colour = "gray60",
x_offset = 1,
y_offset = 1
) +
ggflags::geom_flag(
ggplot2::aes(country = iso2),
size = 3.5,
na.rm = TRUE
) +
ggplot2::theme(
legend.position = "none",
panel.grid.major.y = ggplot2::element_line(color = "#ECF3F3", linewidth = 0.4),
axis.line.x = element_blank()
) +
ggplot2::guides(alpha = "none") +
ggplot2::coord_flip() +
ggplot2::scale_x_reverse() +
ggplot2::scale_y_reverse()Error in eval(e[[2L]], where): object 'rho_rank_w_whr' not found
MPWB Sum by Income Decile across Countries
df_income <-
df_gmh |>
dplyr::filter(
employment_primary != "Student non-working (Full or part-time)",
!base::is.na(income_merg_cat)
) |>
dplyr::mutate(
iso2 = base::tolower(iso2)
)
# income weighted means
means_cty_income <-
df_income |>
dplyr::group_by(country, income_merg_cat, iso2) |>
dplyr::summarise(
n = dplyr::n(),
mpwb_sum_income = Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE),
.groups = "drop"
) |>
dplyr::mutate(
lane_label = base::as.character(income_merg_cat),
flagged = country %in% flagged_countries
) |>
# remove groups with less than 20 participants (unweighted)
dplyr::filter(n >= 20)
eu_income <-
df_income |>
dplyr::filter(country %in% eu_countries) |>
dplyr::group_by(income_merg_cat) |>
dplyr::summarise(
mpwb_sum_income = Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE),
.groups = "drop"
) |>
dplyr::mutate(
country = "EU",
iso2 = "eu",
lane_label = base::as.character(income_merg_cat),
n = NA_integer_,
flagged = FALSE
)
means_cty_income <-
dplyr::bind_rows(means_cty_income, eu_income)
# lane layout
lane_income <-
tibble::tibble(
x_id = base::seq_along(income_order),
label = income_order,
x_pos = (x_id - 1) * center_step
)Error: object 'income_order' not found
bg_income <- make_bg(
lane_income,
y_min_sum,
y_max_sum,
dy_native = dy,
native_to_10_70 = to10_70_sum
)Error: object 'lane_income' not found
Error: object 'lane_income' not found
mean_lines_income <-
df_income |>
dplyr::group_by(income_merg_cat) |>
dplyr::summarise(
y = Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE),
.groups = "drop"
) |>
dplyr::mutate(label = base::as.character(income_merg_cat)) |>
dplyr::left_join(lane_income, by = "label") |>
dplyr::transmute(
xmin = x_pos - lane_width,
xmax = x_pos + lane_width,
y = y
)Error: object 'lane_income' not found
points_income <-
means_cty_income |>
dplyr::mutate(
x_id = base::match(lane_label, lane_income$label),
x_pos = (x_id - 1) * center_step,
x_jit = x_pos + stats::runif(dplyr::n(), -0.7 * lane_width, 0.7 * lane_width),
y_val = mpwb_sum_income
)Error in `dplyr::mutate()`:
ℹ In argument: `x_id = base::match(lane_label, lane_income$label)`.
Caused by error:
! object 'lane_income' not found
ggplot2::ggplot(points_income, ggplot2::aes(x = x_jit, y = y_val)) +
ggplot2::geom_rect(
data = bg_income,
ggplot2::aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = fill),
inherit.aes = FALSE,
color = NA
) +
ggplot2::scale_fill_identity() +
ggplot2::geom_rect(
data = geo_income$overlay,
ggplot2::aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, alpha = overlay_alpha),
inherit.aes = FALSE,
fill = "white",
color = NA
) +
ggplot2::scale_alpha_identity() +
ggplot2::geom_segment(
data = geo_income$edges[-base::nrow(geo_income$edges),],
ggplot2::aes(x = x, xend = xend, y = y, yend = yend),
inherit.aes = FALSE,
color = "#b5bec9",
linewidth = 0.2
) +
ggplot2::geom_segment(
data = mean_lines_income,
ggplot2::aes(x = xmin, xend = xmax, y = y, yend = y),
inherit.aes = FALSE,
color = "#9cacbc",
linewidth = 0.4,
linetype = "solid"
) +
ggplot2::geom_point(
shape = 21,
size = 3.2,
color = "#0B2E55",
na.rm = TRUE
) +
ggplot2::geom_point(
data = dplyr::filter(points_income, flagged),
shape = 21,
size = 3.2,
color = "#b5bec9",
na.rm = TRUE
) +
ggflags::geom_flag(ggplot2::aes(country = iso2), size = 3, na.rm = TRUE) +
ggplot2::scale_x_continuous(
limits = c(
base::min(lane_income$x_pos - lane_width),
base::max(lane_income$x_pos + lane_width)
),
breaks = lane_income$x_pos,
labels = lane_income$label,
expand = c(0,0)
) +
ggplot2::scale_y_continuous(
breaks = base::seq(25, 60, by = 5),
expand = c(0,0)
) +
ggplot2::labs(
x = NULL,
y = "MPWB sum\n"
) +
ggplot2::coord_cartesian(ylim = c(y_min_sum, y_max_sum), clip = "off")Error: object 'points_income' not found
A13. Differences in well-being between main groups.
Figure 4
# Relabel variables for Figure 4
df_fig4 <- df_gmh
# Sex: Additional spaces to force desired balanced
df_fig4$sex_reviewed_cat <- base::factor(
df_fig4$sex_reviewed_cat,
levels = c("", "Male", " ", " ", "Female"," "," ", "Other"," ")
)
# Employment: Add line breaks for better fitting
emp_lvls <- levels(df_fig4$employment_primary)
emp_new <- emp_lvls |>
stringr::str_replace(
"Employed/working full-time \\(25\\+ hours per week\\)",
"\nEmployed/working full-time\n(25+ hours per week)\n"
) |>
stringr::str_replace(
"Employed/working part-time \\(less than 25 hours per week\\)",
"Employed/working part-time\n(less than 25 hours per week)"
) |>
stringr::str_replace(
"^Full-time Student / Part-time Student without employment$",
"Student non-working\n(Full or part-time)"
) |>
stringr::str_replace(
"^Not in paid employment \\(by choice/health\\)$",
"Not in paid employment\n(by choice/health)\n"
) |>
stringr::str_replace(
"^Not in paid employment \\(looking for work\\)$",
"Not in paid employment\n(looking for work)\n"
)
df_fig4$employment_primary <-
base::factor(df_fig4$employment_primary, levels = emp_lvls, labels = emp_new)
# Education: Add line breaks for better fitting
edu_lvls <- levels(df_fig4$education_recoded_cat)
edu_new <- edu_lvls |>
stringr::str_replace(
"Less than secondary",
"Less than\nsecondary"
)
df_fig4$education_recoded_cat <-
base::factor(df_fig4$education_recoded_cat, levels = edu_lvls, labels = edu_new)
# Work arrangement: Add line breaks for better fitting
work_lvls <- levels(df_fig4$work_arrangement_cat_nostudents)
work_new <- work_lvls |>
stringr::str_replace(
"^I work entirely in-person \\(i\\.e\\., in an office, on-site\\)$",
"I work entirely in-person\n(i.e., in an office, on-site)"
) |>
stringr::str_replace(
"^I work about evenly in-person/remote$",
"I work about evenly\nin-person/remote"
) |>
stringr::str_replace(
"^I mostly work remotely, with occasional in-person days$",
"I mostly work remotely,\nwith occasional in-person days"
) |>
stringr::str_replace(
"^I mostly work in-person, with occasional remote days$",
"I mostly work in-person,\nwith occasional remote days"
)
df_fig4$work_arrangement_cat_nostudents <-
base::factor(df_fig4$work_arrangement_cat_nostudents,
levels = work_lvls, labels = work_new)
# Citizenship: Add line breaks for better fitting
cit_lvls <- levels(df_fig4$citizenship_cat)
cit_new <- cit_lvls |>
stringr::str_replace(
"^Non-citizen \\(Permanent Resident\\)$",
"Non-citizen (Permanent\nResident)"
) |>
stringr::str_replace(
"^Born outside country \\(Citizen\\)$",
"Born outside country\n(Citizen)\n"
) |>
stringr::str_replace(
"^Born outside country \\(Non-citizen, Permanent Resident\\)$",
"Born outside country\n(Non-citizen,\nPermanent Resident)\n"
) |>
stringr::str_replace(
"^Born outside country \\(Non-citizen, Non-permanent Resident\\)$",
"\nBorn outside country\n(Non-citizen,\nNon-permanent Resident)\n"
)
df_fig4$citizenship_cat <- factor(df_fig4$citizenship_cat, levels = cit_lvls, labels = cit_new)
# Compute weighted means and CIs per group
# We will use survey package for this for simplicity in computing CIs
dsgn <- survey::svydesign(ids = ~1, weights = ~ps_weight, data = df_fig4)
weighted_summary <- function(varname, label) {
lvls <- levels(df_fig4[[varname]])
tmp <- survey::svyby(
~mpwb_sum, as.formula(paste0("~", varname)),
dsgn, survey::svymean, vartype = "ci", keep.names = FALSE
) |>
as.data.frame()
names(tmp)[1] <- "group"
tibble::tibble(group = lvls) |>
dplyr::left_join(tmp, by = "group") |>
dplyr::mutate(variable = label, group = factor(group, levels = rev(lvls)))
}
# Assemble
all_sum <- dplyr::bind_rows(
weighted_summary("age_group", "Age"),
weighted_summary("sex_reviewed_cat", "Sex"),
weighted_summary("education_recoded_cat", "Education Level"),
weighted_summary("employment_primary", "Employment Status"),
weighted_summary("income_merg_group", "Household Income Level"),
weighted_summary("childhood_SES_cat", "Childhood Socioeconomic Status"),
weighted_summary("citizenship_cat", "Citizenship"),
weighted_summary("household_size_group", "Household Size"),
weighted_summary("work_arrangement_cat_nostudents", "Work Arrangement")
) |>
dplyr::mutate(
variable = base::factor(
variable,
levels = c(
"Age",
"Sex",
"Education Level",
"Employment Status",
"Household Income Level",
"Childhood Socioeconomic Status",
"Citizenship",
"Household Size",
"Work Arrangement"
)
)
) |>
dplyr::group_by(variable) |>
dplyr::arrange(group, .by_group = TRUE) |>
dplyr::mutate(
alt = dplyr::row_number() %% 2,
point_c = ifelse(alt == 1, "#051265", "#16786d"),
ci_c = ifelse(alt == 1, "#4d87ff", "#4fd1c4")
) |>
dplyr::ungroup()
# Plot
p_fig4 <-
ggplot2::ggplot(all_sum, ggplot2::aes(y = group, x = mpwb_sum)) +
ggplot2::geom_vline(
xintercept = mean_mpwb, color = "#BCCBCA", linewidth = 0.45, linetype = "dashed"
) +
ggplot2::geom_errorbar(
ggplot2::aes(xmin = ci_l, xmax = ci_u, color = I(ci_c)),
orientation = "y", linewidth = 0.8, width = 0.15, na.rm = TRUE
) +
ggplot2::geom_point(
ggplot2::aes(color = I(point_c)), size = 1.1, stroke = 0.2, na.rm = TRUE
) +
ggfx::with_shadow(
geom_point(aes(color = I(point_c)), size = 1.6, alpha = 1, stroke = 0.2),
sigma = 3, colour = "gray60", x_offset = 1, y_offset = 1
) +
ggplot2::facet_wrap(~variable, ncol = 3, scales = "free_y") +
ggplot2::labs(x = "\nMPWB Sum", y = NULL) +
theme_gmh +
ggplot2::theme(
panel.grid.major.x =
ggplot2::element_line(color = "#ECF3F3", linetype = "solid", linewidth = 0.25),
panel.grid.minor.x =
ggplot2::element_line(color = "#ECF3F3", linetype = "solid", linewidth = 0.25),
strip.text = ggplot2::element_text(color = "#051520", face = "bold"),
plot.margin = ggplot2::margin(15, 25, 15, 15),
panel.spacing.x = grid::unit(6, "lines")
)Error: object 'mean_mpwb' not found
MPWB and Income
df_dem <- df_gmh |>
dplyr::filter(!is.na(income_merg)) |>
dplyr::mutate(
household_size_z = scale(household_size, center = TRUE, scale = TRUE),
income_merg_group_n = dplyr::case_when(
income_merg_group == "No income" ~ 0,
income_merg_group == "Low" ~ 1,
income_merg_group == "Mid" ~ 2,
income_merg_group == "Upper" ~ 3,
income_merg_group == "Wealthiest" ~ 4
)
)
# Correlation between income and mpwb
weighted_corr(df_dem, income_merg_group_n, mpwb_sum) r t p
1 0.127 28.64 <.001
# Regression
svy_dem <-
survey::svydesign(ids = ~ 1, data = df_dem, weights = ~ ps_weight)
model_inc <-
survey::svyglm(mpwb_sum ~ income_merg_group, design = svy_dem)
summary(model_inc)
Call:
svyglm(formula = mpwb_sum ~ income_merg_group, design = svy_dem)
Survey design:
survey::svydesign(ids = ~1, data = df_dem, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.74625 0.09535 490.258 < 0.0000000000000002 ***
income_merg_group.L 4.52028 0.27462 16.460 < 0.0000000000000002 ***
income_merg_group.Q -0.18397 0.24347 -0.756 0.4499
income_merg_group.C 0.42761 0.16961 2.521 0.0117 *
income_merg_group^4 -0.52972 0.13556 -3.908 0.0000933 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 120.6656)
Number of Fisher Scoring iterations: 2
# A tibble: 1 × 7
Ward_F df1 df2 p r2 cohens_f percent_var_explained
<chr> <int> <dbl> <chr> <chr> <chr> <chr>
1 117.50 4 53077 < .001 0.0168 0.1307 1.6793
# Pairwise comparisons consecutive income groups
emm_inc <- emmeans::emmeans(
model_inc,
~ income_merg_group
); emm_inc income_merg_group emmean SE df lower.CL upper.CL
No income 43.59 0.378 53077 42.85 44.33
Low 45.89 0.113 53077 45.67 46.11
Mid 46.46 0.138 53077 46.20 46.73
Upper 48.21 0.117 53077 47.98 48.44
Wealthiest 49.58 0.197 53077 49.19 49.96
Confidence level used: 0.95
adj_contr <-
emmeans::contrast(
emm_inc,
method = "consec",
adjust = "holm"
)
summary(
adj_contr,
infer = TRUE
) contrast estimate SE df lower.CL upper.CL t.ratio p.value
Low - No income 2.299 0.395 53077 1.313 3.29 5.825 <.0001
Mid - Low 0.575 0.178 53077 0.131 1.02 3.233 0.0012
Upper - Mid 1.743 0.181 53077 1.292 2.19 9.643 <.0001
Wealthiest - Upper 1.371 0.229 53077 0.798 1.94 5.980 <.0001
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 4 estimates
P value adjustment: holm method for 4 tests
MPWB, Income and Household size
r t p
1 0.126 28.55 <.001
r t p
1 0.129 27.65 <.001
# Regression household size
model_hh_main <-
survey::svyglm(mpwb_sum ~ household_size_group, design = svy_dem)
summary(model_hh_main)
Call:
svyglm(formula = mpwb_sum ~ household_size_group, design = svy_dem)
Survey design:
survey::svydesign(ids = ~1, data = df_dem, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 47.36664 0.07059 671.043 < 0.0000000000000002 ***
household_size_group.L 3.73118 0.17145 21.762 < 0.0000000000000002 ***
household_size_group.Q -0.38256 0.16966 -2.255 0.024147 *
household_size_group.C 0.51847 0.13877 3.736 0.000187 ***
household_size_group^4 -0.35829 0.14904 -2.404 0.016220 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 120.138)
Number of Fisher Scoring iterations: 2
# A tibble: 1 × 7
Ward_F df1 df2 p r2 cohens_f percent_var_explained
<chr> <int> <dbl> <chr> <chr> <chr> <chr>
1 148.28 4 53077 < .001 0.0211 0.1468 2.1091
# Pairwise comparisons consecutive household size groups
emm_hh <- emmeans::emmeans(
model_hh_main,
~ household_size_group
); emm_hh household_size_group emmean SE df lower.CL upper.CL
1 44.6 0.139 53077 44.3 44.9
2 46.8 0.125 53077 46.5 47.0
3 47.3 0.165 53077 47.0 47.6
4-5 48.5 0.127 53077 48.2 48.7
6-20 49.6 0.215 53077 49.2 50.1
Confidence level used: 0.95
adj_contr <-
emmeans::contrast(
emm_hh,
method = "consec",
adjust = "holm"
)
summary(
adj_contr,
infer = TRUE
) contrast estimate SE df lower.CL upper.CL t.ratio p.value
2 - 1 2.193 0.187 53077 1.72597 2.66 11.736 <.0001
3 - 2 0.526 0.207 53077 0.00877 1.04 2.540 0.0111
(4-5) - 3 1.178 0.208 53077 0.65769 1.70 5.655 <.0001
(6-20) - (4-5) 1.151 0.250 53077 0.52696 1.77 4.607 <.0001
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 4 estimates
P value adjustment: holm method for 4 tests
# Regression income and household size
model_inc_hh_main <-
survey::svyglm(mpwb_sum ~ income_merg_group + household_size_z, design = svy_dem)
summary(model_inc_hh_main)
Call:
svyglm(formula = mpwb_sum ~ income_merg_group + household_size_z,
design = svy_dem)
Survey design:
survey::svydesign(ids = ~1, data = df_dem, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.76915 0.09531 490.715 < 0.0000000000000002 ***
income_merg_group.L 4.04430 0.27596 14.655 < 0.0000000000000002 ***
income_merg_group.Q -0.32627 0.24334 -1.341 0.179987
income_merg_group.C 0.35690 0.16963 2.104 0.035386 *
income_merg_group^4 -0.51333 0.13502 -3.802 0.000144 ***
household_size_z 1.23345 0.06856 17.991 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 119.1795)
Number of Fisher Scoring iterations: 2
Error in solve.default(V): 'a' is 0-diml
# Regression interaction
model_inc_hh_int <-
survey::svyglm(mpwb_sum ~ income_merg_group * household_size_z, design = svy_dem)
summary(model_inc_hh_int)
Call:
svyglm(formula = mpwb_sum ~ income_merg_group * household_size_z,
design = svy_dem)
Survey design:
survey::svydesign(ids = ~1, data = df_dem, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.78676 0.09772 478.800 < 0.0000000000000002 ***
income_merg_group.L 4.21537 0.28326 14.882 < 0.0000000000000002 ***
income_merg_group.Q -0.35103 0.25021 -1.403 0.160636
income_merg_group.C 0.45699 0.17280 2.645 0.008181 **
income_merg_group^4 -0.51683 0.13517 -3.823 0.000132 ***
household_size_z 1.05511 0.10051 10.498 < 0.0000000000000002 ***
income_merg_group.L:household_size_z -0.11972 0.29405 -0.407 0.683896
income_merg_group.Q:household_size_z -0.71939 0.25789 -2.790 0.005280 **
income_merg_group.C:household_size_z 0.46439 0.17749 2.616 0.008887 **
income_merg_group^4:household_size_z 0.15260 0.13252 1.152 0.249493
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 119.0505)
Number of Fisher Scoring iterations: 2
Error in solve.default(V): 'a' is 0-diml
there are higher-order terms (interactions) in this model
consider setting type = 'predictor'; see ?vif
GVIF Df GVIF^(1/(2*Df))
income_merg_group 1.192992 4 1.022303
household_size_z 2.481244 1 1.575196
income_merg_group:household_size_z 2.765384 4 1.135584
# Simple slopes of income at different household sizes
slopes_inc_hh_int <- emmeans::emtrends(
model_inc_hh_int,
~ income_merg_group,
var = "household_size_z",
adjust = "holm"
)
# Plot
interactions::interact_plot(
model_inc_hh_int,
pred = household_size_z,
modx = income_merg_group,
interval = TRUE
)MPWB, Income and Childhood SES
df_inc_ses <- df_gmh |>
dplyr::filter(!is.na(income_merg) & !is.na(childhood_SES)) |>
dplyr::mutate(
income_merg_group_n = dplyr::case_when(
income_merg_group == "No income" ~ 0,
income_merg_group == "Low" ~ 1,
income_merg_group == "Mid" ~ 2,
income_merg_group == "Upper" ~ 3,
income_merg_group == "Wealthiest" ~ 4
)
)
weighted_corr(df_inc_ses, childhood_SES, mpwb_sum) r t p
1 0.135 26.26 <.001
r t p
1 0.173 34.44 <.001
svy_inc_ses <-
survey::svydesign(
ids = ~1,
data = df_inc_ses,
weights = ~ps_weight
)
model_ses <-
survey::svyglm(
mpwb_sum ~ childhood_SES_cat,
design = svy_inc_ses
)
summary(model_ses)
Call:
svyglm(formula = mpwb_sum ~ childhood_SES_cat, design = svy_inc_ses)
Survey design:
survey::svydesign(ids = ~1, data = df_inc_ses, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.54214 0.12214 381.060 <0.0000000000000002 ***
childhood_SES_cat.L 4.49242 0.35785 12.554 <0.0000000000000002 ***
childhood_SES_cat.Q -0.43210 0.30947 -1.396 0.163
childhood_SES_cat.C -0.04965 0.22557 -0.220 0.826
childhood_SES_cat^4 0.39595 0.15377 2.575 0.010 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 123.7373)
Number of Fisher Scoring iterations: 2
# A tibble: 1 × 7
Ward_F df1 df2 p r2 cohens_f percent_var_explained
<chr> <int> <dbl> <chr> <chr> <chr> <chr>
1 87.27 4 38108 < .001 0.0190 0.1392 1.9012
# Pairwise comparisons childhood SES groups
emm_ses <- emmeans::emmeans(
model_ses,
~ childhood_SES_cat
); emm_ses childhood_SES_cat emmean SE df lower.CL upper.CL
Poor 43.5 0.260 38108 43.0 44.0
Below average but not poor 45.0 0.156 38108 44.7 45.3
Around average 47.1 0.123 38108 46.8 47.3
Above average but not wealthy 47.9 0.161 38108 47.6 48.2
Wealthy 49.2 0.490 38108 48.2 50.1
Confidence level used: 0.95
adj_contr <-
emmeans::contrast(
emm_ses,
method = "pairwise",
adjust = "holm"
)
summary(
adj_contr,
infer = TRUE
) contrast estimate SE df lower.CL upper.CL t.ratio p.value
Poor - Below average but not poor -1.483 0.303 38108 -2.33 -0.632 -4.893 <.0001
Poor - Around average -3.524 0.287 38108 -4.33 -2.718 -12.265 <.0001
Poor - Above average but not wealthy -4.387 0.306 38108 -5.25 -3.530 -14.358 <.0001
Poor - Wealthy -5.651 0.555 38108 -7.21 -4.094 -10.190 <.0001
Below average but not poor - Around average -2.041 0.199 38108 -2.60 -1.483 -10.269 <.0001
Below average but not poor - Above average but not wealthy -2.904 0.224 38108 -3.53 -2.274 -12.947 <.0001
Below average but not poor - Wealthy -4.168 0.514 38108 -5.61 -2.724 -8.104 <.0001
Around average - Above average but not wealthy -0.863 0.202 38108 -1.43 -0.295 -4.266 0.0001
Around average - Wealthy -2.127 0.505 38108 -3.54 -0.709 -4.211 0.0001
Above average but not wealthy - Wealthy -1.264 0.516 38108 -2.71 0.184 -2.450 0.0143
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 10 estimates
P value adjustment: holm method for 10 tests
# Income and childhood SES Main effects
model_income_ses <-
survey::svyglm(
mpwb_sum ~ income_merg_group + childhood_SES_cat,
design = svy_inc_ses
)
summary(model_income_ses)
Call:
svyglm(formula = mpwb_sum ~ income_merg_group + childhood_SES_cat,
design = svy_inc_ses)
Survey design:
survey::svydesign(ids = ~1, data = df_inc_ses, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.25256 0.14290 323.671 < 0.0000000000000002 ***
income_merg_group.L 4.58593 0.31873 14.388 < 0.0000000000000002 ***
income_merg_group.Q -0.29520 0.28515 -1.035 0.300566
income_merg_group.C 0.25879 0.19894 1.301 0.193309
income_merg_group^4 -0.59832 0.16103 -3.716 0.000203 ***
childhood_SES_cat.L 3.69037 0.35855 10.292 < 0.0000000000000002 ***
childhood_SES_cat.Q -0.52252 0.30809 -1.696 0.089888 .
childhood_SES_cat.C -0.06313 0.22405 -0.282 0.778142
childhood_SES_cat^4 0.45573 0.15264 2.986 0.002831 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 121.5788)
Number of Fisher Scoring iterations: 2
Error in solve.default(V): 'a' is 0-diml
# Income and childhood SES Interaction
model_income_ses_int <-
survey::svyglm(
mpwb_sum ~ income_merg_group * childhood_SES_cat,
design = svy_inc_ses
)
summary(model_income_ses_int)
Call:
svyglm(formula = mpwb_sum ~ income_merg_group * childhood_SES_cat,
design = svy_inc_ses)
Survey design:
survey::svydesign(ids = ~1, data = df_inc_ses, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.26483 0.15304 302.313 < 0.0000000000000002 ***
income_merg_group.L 4.63870 0.39961 11.608 < 0.0000000000000002 ***
income_merg_group.Q -0.15220 0.38343 -0.397 0.691415
income_merg_group.C 0.32351 0.27458 1.178 0.238725
income_merg_group^4 -0.98679 0.29377 -3.359 0.000783 ***
childhood_SES_cat.L 3.53421 0.43822 8.065 0.000000000000000754 ***
childhood_SES_cat.Q -0.68714 0.38177 -1.800 0.071886 .
childhood_SES_cat.C 0.05104 0.29275 0.174 0.861599
childhood_SES_cat^4 0.24021 0.21194 1.133 0.257063
income_merg_group.L:childhood_SES_cat.L -0.93085 1.11900 -0.832 0.405492
income_merg_group.Q:childhood_SES_cat.L 0.22070 1.09446 0.202 0.840188
income_merg_group.C:childhood_SES_cat.L -0.53745 0.78566 -0.684 0.493935
income_merg_group^4:childhood_SES_cat.L -2.10395 0.87946 -2.392 0.016747 *
income_merg_group.L:childhood_SES_cat.Q 0.40277 0.98252 0.410 0.681859
income_merg_group.Q:childhood_SES_cat.Q -0.18342 0.95475 -0.192 0.847651
income_merg_group.C:childhood_SES_cat.Q 0.23270 0.68409 0.340 0.733735
income_merg_group^4:childhood_SES_cat.Q -1.25365 0.75504 -1.660 0.096847 .
income_merg_group.L:childhood_SES_cat.C -0.02718 0.78728 -0.035 0.972464
income_merg_group.Q:childhood_SES_cat.C 1.48587 0.73619 2.018 0.043565 *
income_merg_group.C:childhood_SES_cat.C 0.25358 0.52702 0.481 0.630403
income_merg_group^4:childhood_SES_cat.C -1.39863 0.52391 -2.670 0.007597 **
income_merg_group.L:childhood_SES_cat^4 0.34907 0.59708 0.585 0.558801
income_merg_group.Q:childhood_SES_cat^4 -0.58874 0.53759 -1.095 0.273458
income_merg_group.C:childhood_SES_cat^4 -0.20592 0.38059 -0.541 0.588472
income_merg_group^4:childhood_SES_cat^4 -0.79241 0.32864 -2.411 0.015907 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 121.4091)
Number of Fisher Scoring iterations: 2
Error in solve.default(V): 'a' is 0-diml
Working (Rao-Scott+F) LRT for income_merg_group:childhood_SES_cat
in svyglm(formula = mpwb_sum ~ income_merg_group * childhood_SES_cat,
design = svy_inc_ses)
Working 2logLR = 26.03592 p= 0.058265
(scale factors: 1.7 1.4 1.2 1.1 1.1 1.1 1 0.94 0.92 0.89 0.84 0.79 0.79 0.77 0.72 0.69 ); denominator df= 38088
eff.p AIC deltabar
[1,] 16.88628 291156.6 1.876253
[2,] 49.53248 291168.7 1.981299
A14. Working Conditions and Mental Health
A15. Demographic Trends Within Countries
Figure 5 (MPWB Scores)
fig5_palette <-
c("#33d9c6", "#ffa954", "#4dcfff", "#0a4c79", "#a883b7", "#f0b0f0", "#051520")
svy_all <- survey::svydesign(
ids = ~1,
weights = ~ps_weight,
data = df_gmh
)
country_means <-
survey::svyby(
~mpwb_sum,
~country,
svy_all,
survey::svymean,
na.rm = TRUE
) |>
tibble::as_tibble() |>
dplyr::transmute(
country,
mean = mpwb_sum
)
svy_eu <- subset(svy_all, country %in% eu_countries)
eu_mean <-
survey::svymean(
~mpwb_sum,
svy_eu,
na.rm = TRUE
) |>
base::as.numeric()
eu_row <- tibble::tibble(
country = "EU",
mean = eu_mean
)
overall_mean_global <-
survey::svymean(
~mpwb_sum,
svy_all,
na.rm = TRUE
) |>
base::as.numeric()
overall_row <- tibble::tibble(
country = "Overall",
mean = overall_mean_global
)
country_order_all <-
dplyr::bind_rows(
country_means,
eu_row,
overall_row
) |>
dplyr::arrange(mean) |>
dplyr::pull(country)
summarise_cells_svy <- function(group_var) {
by_cty_raw <-
survey::svyby(
~mpwb_sum,
stats::as.formula(paste0("~country + ", group_var)),
svy_all,
survey::svymean,
vartype = "se",
na.rm = TRUE,
keep.names = TRUE
) |>
tibble::as_tibble()
by_cty <-
by_cty_raw |>
dplyr::rename(
group = !!rlang::sym(group_var),
mean = mpwb_sum,
se = se
)
cell_stats <-
df_gmh |>
dplyr::group_by(country, !!rlang::sym(group_var)) |>
dplyr::summarise(
n_obs = dplyr::n(),
n_weighted = base::sum(ps_weight),
.groups = "drop"
) |>
dplyr::rename(group = !!rlang::sym(group_var))
by_cty |>
dplyr::left_join(cell_stats, by = c("country", "group")) |>
dplyr::mutate(
# Remove CIs for cells with less than 20 participants (unweighted)
has_ci = !is.na(n_obs) & n_obs >= 20 & is.finite(se),
lo = dplyr::if_else(has_ci, mean - 1.96 * se, NA_real_),
hi = dplyr::if_else(has_ci, mean + 1.96 * se, NA_real_)
)
}
summarise_region_svy <- function(group_var, country_filter = NULL, country_label) {
if (is.null(country_filter)) {
des <- svy_all
df_region <- df_gmh
} else {
des <- subset(svy_all, country %in% country_filter)
df_region <- df_gmh |>
dplyr::filter(country %in% country_filter)
}
est_raw <-
survey::svyby(
~mpwb_sum,
stats::as.formula(paste0("~", group_var)),
des,
survey::svymean,
vartype = "se",
na.rm = TRUE,
keep.names = TRUE
) |>
tibble::as_tibble()
est_raw |>
dplyr::rename(
group = !!rlang::sym(group_var),
mean = mpwb_sum,
se = se
) |>
dplyr::mutate(
n_obs = nrow(df_region),
n_weighted = base::sum(df_region$ps_weight),
lo = mean - 1.96 * se,
hi = mean + 1.96 * se,
country = country_label
)
}
make_panel <- function(group_var, panel_title) {
by_cty <- summarise_cells_svy(group_var)
eu <-
summarise_region_svy(
group_var,
country_filter = eu_countries,
country_label = "EU"
)
overall <-
summarise_region_svy(
group_var,
country_filter = NULL,
country_label = "Overall"
)
dp_raw <-
dplyr::bind_rows(by_cty, eu, overall)
if (!nrow(dp_raw)) return(NULL)
dp <-
dp_raw |>
dplyr::mutate(
country = base::factor(country, levels = country_order_all)
)
pd <- ggplot2::position_dodge(width = 0.35)
overall_position <- which(levels(dp$country) == "Overall")
labels_df <-
dp |>
dplyr::group_by(country) |>
dplyr::summarise(
label = dplyr::first(as.character(country)),
.groups = "drop"
) |>
dplyr::mutate(
label_color = ifelse(label == "Overall", "#4ca3df", "#051520")
)
p <-
ggplot2::ggplot(
dp,
ggplot2::aes(
x = country,
y = mean,
color = group,
group = group
)
) +
ggplot2::geom_hline(
yintercept = overall_mean_global,
linetype = "dashed",
color = "#a1b6c4"
) +
ggplot2::geom_hline(
yintercept = 40,
linewidth = 0.5,
color = "#a1b6c4"
) +
ggplot2::geom_vline(
xintercept = overall_position,
color = "#519fd5",
linetype = "solid",
linewidth = 0.7,
alpha = 0.3
) +
ggplot2::geom_errorbar(
ggplot2::aes(ymin = lo, ymax = hi),
width = 0,
linewidth = 0.4,
position = pd,
na.rm = TRUE
) +
ggfx::with_shadow(
ggplot2::geom_point(
position = pd,
size = 1.6,
stroke = 0,
na.rm = TRUE
),
sigma = 2,
colour = "gray80",
x_offset = 1,
y_offset = 1
) +
ggplot2::geom_point(
size = 1,
position = pd,
na.rm = TRUE
) +
ggplot2::scale_x_discrete(drop = FALSE) +
ggplot2::scale_y_continuous(limits = c(29, 61)) +
ggplot2::geom_text(
data = labels_df,
ggplot2::aes(
x = country,
y = 29,
label = label
),
inherit.aes = FALSE,
color = labels_df$label_color,
angle = -40,
hjust = 0,
vjust = 2,
family = "Inter",
fontface = "bold",
size = 3
) +
ggplot2::scale_color_manual(
values = fig5_palette,
drop = FALSE
) +
ggplot2::labs(
title = panel_title,
x = NULL,
y = NULL,
color = NULL
) +
ggplot2::coord_cartesian(clip = "off") +
ggplot2::theme(
panel.border = ggplot2::element_blank(),
panel.grid = ggplot2::element_blank(),
panel.grid.major.y = ggplot2::element_blank(),
panel.grid.minor.y = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_line(
color = "#ECF3F3",
linewidth = 0.25
),
panel.grid.minor.x = ggplot2::element_blank(),
axis.text.x = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank(),
axis.line.x = ggplot2::element_blank(),
legend.position = "top",
legend.direction = "horizontal",
legend.box.just = "left",
legend.justification = c(0, 0.5),
legend.key = ggplot2::element_blank(),
legend.text = ggplot2::element_text(
color = "#051520",
size = 9,
margin = ggplot2::margin(r = 8)
),
legend.box.margin = ggplot2::margin(l = -20),
plot.title = ggplot2::element_text(
face = "bold",
color = "#051520"
),
plot.title.position = "plot",
plot.margin = ggplot2::margin(25, 60, 60, 15)
) +
ggplot2::guides(colour = ggplot2::guide_legend(nrow = 1))
p
}
p_sum_age <- make_panel("age_group", "Age")
p_sum_sex <- make_panel("sex_reviewed_cat", "Sex")
p_sum_emp <- make_panel("employment_primary", "Employment Status")
p_sum_edu <- make_panel("education_recoded_cat", "Education Level")
title_sum <-
cowplot::ggdraw() +
cowplot::draw_label(
"MPWB Sum",
fontface = "bold",
size = 12,
angle = 90,
x = 0.5,
y = 0.5
)
grid_sum <-
cowplot::plot_grid(
p_sum_age,
p_sum_sex,
p_sum_emp,
p_sum_edu,
ncol = 1,
align = "hv"
)Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Factor Scores
fig5_palette <-
c("#33d9c6", "#ffa954", "#4dcfff", "#0a4c79", "#a883b7", "#f0b0f0", "#051520")
svy_all <- survey::svydesign(
ids = ~1,
weights = ~ps_weight,
data = df_gmh
)
country_means <-
survey::svyby(
~mpwb_factor_within,
~country,
svy_all,
survey::svymean,
na.rm = TRUE
) |>
tibble::as_tibble() |>
dplyr::transmute(
country,
mean = mpwb_factor_within
)
svy_eu <- subset(svy_all, country %in% eu_countries)
eu_mean <-
survey::svymean(
~mpwb_factor_within,
svy_eu,
na.rm = TRUE
) |>
base::as.numeric()
eu_row <- tibble::tibble(
country = "EU",
mean = eu_mean
)
overall_mean_global <-
survey::svymean(
~mpwb_factor_within,
svy_all,
na.rm = TRUE
) |>
base::as.numeric()
overall_row <- tibble::tibble(
country = "Overall",
mean = overall_mean_global
)
country_order_all <-
dplyr::bind_rows(
country_means,
eu_row,
overall_row
) |>
dplyr::arrange(mean) |>
dplyr::pull(country)
summarise_cells_svy <- function(group_var) {
by_cty_raw <-
survey::svyby(
~mpwb_factor_within,
stats::as.formula(paste0("~country + ", group_var)),
svy_all,
survey::svymean,
vartype = "se",
na.rm = TRUE,
keep.names = TRUE
) |>
tibble::as_tibble()
by_cty <-
by_cty_raw |>
dplyr::rename(
group = !!rlang::sym(group_var),
mean = mpwb_factor_within,
se = se
)
cell_stats <-
df_gmh |>
dplyr::group_by(country, !!rlang::sym(group_var)) |>
dplyr::summarise(
n_obs = dplyr::n(),
n_weighted = base::sum(ps_weight),
.groups = "drop"
) |>
dplyr::rename(group = !!rlang::sym(group_var))
by_cty |>
dplyr::left_join(cell_stats, by = c("country", "group")) |>
dplyr::mutate(
# Remove CIs for cells with less than 20 participants (unweighted)
has_ci = !is.na(n_obs) & n_obs >= 20 & is.finite(se),
lo = dplyr::if_else(has_ci, mean - 1.96 * se, NA_real_),
hi = dplyr::if_else(has_ci, mean + 1.96 * se, NA_real_)
)
}
summarise_region_svy <- function(group_var, country_filter = NULL, country_label) {
if (is.null(country_filter)) {
des <- svy_all
df_region <- df_gmh
} else {
des <- subset(svy_all, country %in% country_filter)
df_region <- df_gmh |>
dplyr::filter(country %in% country_filter)
}
est_raw <-
survey::svyby(
~mpwb_factor_within,
stats::as.formula(paste0("~", group_var)),
des,
survey::svymean,
vartype = "se",
na.rm = TRUE,
keep.names = TRUE
) |>
tibble::as_tibble()
est_raw |>
dplyr::rename(
group = !!rlang::sym(group_var),
mean = mpwb_factor_within,
se = se
) |>
dplyr::mutate(
n_obs = nrow(df_region),
n_weighted = base::sum(df_region$ps_weight),
lo = mean - 1.96 * se,
hi = mean + 1.96 * se,
country = country_label
)
}
make_panel <- function(group_var, panel_title) {
by_cty <- summarise_cells_svy(group_var)
eu <-
summarise_region_svy(
group_var,
country_filter = eu_countries,
country_label = "EU"
)
overall <-
summarise_region_svy(
group_var,
country_filter = NULL,
country_label = "Overall"
)
dp_raw <-
dplyr::bind_rows(by_cty, eu, overall)
if (!nrow(dp_raw)) return(NULL)
dp <-
dp_raw |>
dplyr::mutate(
country = base::factor(country, levels = country_order_all)
)
pd <- ggplot2::position_dodge(width = 0.35)
overall_position <- which(levels(dp$country) == "Overall")
labels_df <-
dp |>
dplyr::group_by(country) |>
dplyr::summarise(
label = dplyr::first(as.character(country)),
.groups = "drop"
) |>
dplyr::mutate(
label_color = ifelse(label == "Overall", "#4ca3df", "#051520")
)
p <-
ggplot2::ggplot(
dp,
ggplot2::aes(
x = country,
y = mean,
color = group,
group = group
)
) +
ggplot2::geom_hline(
yintercept = overall_mean_global,
linetype = "dashed",
color = "#a1b6c4"
) +
ggplot2::geom_hline(
yintercept = 40,
linewidth = 0.5,
color = "#a1b6c4"
) +
ggplot2::geom_vline(
xintercept = overall_position,
color = "#519fd5",
linetype = "solid",
linewidth = 0.7,
alpha = 0.3
) +
ggplot2::geom_errorbar(
ggplot2::aes(ymin = lo, ymax = hi),
width = 0,
linewidth = 0.4,
position = pd,
na.rm = TRUE
) +
ggfx::with_shadow(
ggplot2::geom_point(
position = pd,
size = 1.6,
stroke = 0,
na.rm = TRUE
),
sigma = 2,
colour = "gray80",
x_offset = 1,
y_offset = 1
) +
ggplot2::geom_point(
size = 1,
position = pd,
na.rm = TRUE
) +
ggplot2::scale_x_discrete(drop = FALSE) +
ggplot2::scale_y_continuous(limits = c(29, 61)) +
ggplot2::geom_text(
data = labels_df,
ggplot2::aes(
x = country,
y = 29,
label = label
),
inherit.aes = FALSE,
color = labels_df$label_color,
angle = -40,
hjust = 0,
vjust = 2,
family = "Inter",
fontface = "bold",
size = 3
) +
ggplot2::scale_color_manual(
values = fig5_palette,
drop = FALSE
) +
ggplot2::labs(
title = panel_title,
x = NULL,
y = NULL,
color = NULL
) +
ggplot2::coord_cartesian(clip = "off") +
ggplot2::theme(
panel.border = ggplot2::element_blank(),
panel.grid = ggplot2::element_blank(),
panel.grid.major.y = ggplot2::element_blank(),
panel.grid.minor.y = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_line(
color = "#ECF3F3",
linewidth = 0.25
),
panel.grid.minor.x = ggplot2::element_blank(),
axis.text.x = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank(),
axis.line.x = ggplot2::element_blank(),
legend.position = "top",
legend.direction = "horizontal",
legend.box.just = "left",
legend.justification = c(0, 0.5),
legend.key = ggplot2::element_blank(),
legend.text = ggplot2::element_text(
color = "#051520",
size = 9,
margin = ggplot2::margin(r = 8)
),
legend.box.margin = ggplot2::margin(l = -20),
plot.title = ggplot2::element_text(
face = "bold",
color = "#051520"
),
plot.title.position = "plot",
plot.margin = ggplot2::margin(25, 60, 60, 15)
) +
ggplot2::guides(colour = ggplot2::guide_legend(nrow = 1))
p
}
p_sum_age <- make_panel("age_group", "Age")
p_sum_sex <- make_panel("sex_reviewed_cat", "Sex")
p_sum_emp <- make_panel("employment_primary", "Employment Status")
p_sum_edu <- make_panel("education_recoded_cat", "Education Level")
title_sum <-
cowplot::ggdraw() +
cowplot::draw_label(
"MPWB Factor Scores",
fontface = "bold",
size = 12,
angle = 90,
x = 0.5,
y = 0.5
)
grid_sum <-
cowplot::plot_grid(
p_sum_age,
p_sum_sex,
p_sum_emp,
p_sum_edu,
ncol = 1,
align = "hv"
)Warning: Removed 1 row containing missing values or values outside the scale range (`geom_hline()`).
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_hline()`).
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_hline()`).
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_hline()`).
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font family 'Inter' not found in PostScript font database
MPWB and Sex Within Countries
df_sex <-
df_gmh |>
filter(!is.na(sex_binary_cat))
# Compute weighted means by country
df_sex_cty <- df_sex |>
group_by(country, sex_binary_cat) |>
summarise(
mean_mpwb = Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE)
)
# Convert to wide for difference
df_sex_diff <-
df_sex_cty |>
tidyr::pivot_wider(
names_from = sex_binary_cat,
values_from = mean_mpwb
) |>
mutate(
diff_female_male = Female - Male
)
df_sex_diff |>
print_reactable(sorted_col = "country", width = 800) Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.9109 -2.1417 -0.9577 -0.8620 0.2378 3.5527
[1] 1.862793
# Global model
svy_sex <- survey::svydesign(ids = ~1, data = df_sex, weights = ~ps_weight)
model_sex <- survey::svyglm(
mpwb_sum ~ sex_binary,
design = svy_sex
)
base::summary(model_sex)
Call:
svyglm(formula = mpwb_sum ~ sex_binary, design = svy_sex)
Survey design:
survey::svydesign(ids = ~1, data = df_sex, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.61058 0.08302 561.455 < 0.0000000000000002 ***
sex_binary 0.78132 0.13519 5.779 0.00000000754 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 121.7702)
Number of Fisher Scoring iterations: 2
# A tibble: 1 × 7
Ward_F df1 df2 p r2 cohens_f percent_var_explained
<chr> <int> <dbl> <chr> <chr> <chr> <chr>
1 33.40 1 53343 < .001 0.0012 0.0352 0.1240
# Within-country
# Loop per country and fit svyglm
country_diffs <- df_sex |>
dplyr::group_by(country) |>
dplyr::group_modify(~ {
des_sub <- srvyr::as_survey_design(.x, weights = ps_weight)
fit <- survey::svyglm(mpwb_sum ~ sex_binary, design = des_sub)
tt <- broom::tidy(fit)
row <- tt |> dplyr::filter(term == "sex_binary")
w_n <- sum(.x$ps_weight, na.rm = TRUE)
tibble::tibble(
diff = row$estimate,
se = row$std.error,
p_value = row$p.value,
w_n = w_n
)
}) |>
dplyr::ungroup() |>
dplyr::mutate(
ci_low = diff - 1.96 * se,
ci_high = diff + 1.96 * se
)
# Summaries across countries
country_diffs |>
dplyr::summarise(
n_countries = dplyr::n(),
prop_positive = mean(diff > 0, na.rm = TRUE),
prop_negative = mean(diff < 0, na.rm = TRUE),
median_diff = median(diff, na.rm = TRUE),
median_abs_diff = median(abs(diff), na.rm = TRUE),
iqr_abs_diff = IQR(abs(diff), na.rm = TRUE)
) # A tibble: 1 × 6
n_countries prop_positive prop_negative median_diff median_abs_diff iqr_abs_diff
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 92 0.696 0.304 0.958 1.37 1.85
# Count countries with CIs excluding zero
country_diffs |>
dplyr::mutate(
significant_pos = (ci_low > 0),
significant_neg = (ci_high < 0)
) |>
dplyr::summarise(
n_total = dplyr::n(),
n_sig_pos = sum(significant_pos, na.rm = TRUE),
n_sig_neg = sum(significant_neg, na.rm = TRUE),
prop_sig_pos = n_sig_pos / n_total,
prop_sig_neg = n_sig_neg / n_total
)# A tibble: 1 × 5
n_total n_sig_pos n_sig_neg prop_sig_pos prop_sig_neg
<int> <int> <int> <dbl> <dbl>
1 92 18 4 0.196 0.0435
# Random-effects meta-analysis across country
res_re <- metafor::rma(yi = diff, sei = se, data = country_diffs, method = "REML")
summary(res_re)
Random-Effects Model (k = 92; tau^2 estimator: REML)
logLik deviance AIC BIC AICc
-181.1428 362.2857 366.2857 371.3074 366.4220
tau^2 (estimated amount of total heterogeneity): 1.0610 (SE = 0.3761)
tau (square root of estimated tau^2 value): 1.0300
I^2 (total heterogeneity / total variability): 45.65%
H^2 (total variability / sampling variability): 1.84
Test for Heterogeneity:
Q(df = 91) = 162.7624, p-val < .0001
Model Results:
estimate se zval pval ci.lb ci.ub
0.8976 0.1730 5.1889 <.0001 0.5586 1.2367 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
psex <- ggplot(forest_df, aes(x = diff, y = country, text = hover_txt)) +
geom_point() +
geom_errorbarh(aes(xmin = ci_low, xmax = ci_high), height = 0.2) +
geom_vline(xintercept = coef(res_re), linetype = "dashed", color = "#4ca3df") +
labs(
x = "Male − Female Difference (MPWB Sum)",
y = "",
) +
theme(
panel.grid.major.y = ggplot2::element_line(
color = "#ddeded", linewidth = 0.25)
)
plotly::ggplotly(psex, tooltip = "text")MPWB and Age Within Countries
df_age <- df_gmh |>
dplyr::mutate(
age_c = scale(age, center = TRUE, scale = FALSE),
age_c2 = age_c^2
)
table_label(df_age$age)$age
Participant’s age computed from birth year, using 1404 (Solar Hijri) for participants from survey version "FA-IRN", and using 2025 (Gregorian) for all other versions. The age values of the sponsored participants from the dataser provided by hte team representing Ireland were merged.
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
530 1157 1523 1446 1578 1566 1725 2025 1812 1809 1746 1825 1818 1704 1670 1674 1573 1683 1430 1424 1308 1268 1196 1139 1040 1034 906 1022 844 790
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
778 771 753 707 682 621 599 601 539 519 495 467 456 336 333 352 305 271 262 220 204 192 176 131 123 104 93 67 82 65
78 79 80 81 82 83 84 85 86 87 88 89 90 92 93 95 100 <NA>
53 37 34 23 22 15 12 8 5 5 1 1 2 2 1 4 5 0
Class: numeric
# Global model
svy_age <- survey::svydesign(ids = ~1, data = df_age, weights = ~ps_weight)
model_age <- survey::svyglm(
mpwb_sum ~ age,
design = svy_age
)
summary(model_age)
Call:
svyglm(formula = mpwb_sum ~ age, design = svy_age)
Survey design:
survey::svydesign(ids = ~1, data = df_age, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 44.612772 0.178537 249.88 <0.0000000000000002 ***
age 0.057142 0.004401 12.98 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 122.1458)
Number of Fisher Scoring iterations: 2
# A tibble: 1 × 7
Ward_F df1 df2 p r2 cohens_f percent_var_explained
<chr> <int> <dbl> <chr> <chr> <chr> <chr>
1 168.55 1 53797 < .001 0.0062 0.0790 0.6196
model_age_quad <- survey::svyglm(
mpwb_sum ~ age_c + age_c2,
design = svy_age
)
summary(model_age_quad)
Call:
svyglm(formula = mpwb_sum ~ age_c + age_c2, design = svy_age)
Survey design:
survey::svydesign(ids = ~1, data = df_age, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.5937198 0.0895137 520.521 <0.0000000000000002 ***
age_c 0.0474940 0.0055845 8.505 <0.0000000000000002 ***
age_c2 0.0006757 0.0002856 2.366 0.018 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 122.1173)
Number of Fisher Scoring iterations: 2
Error in solve.default(V): 'a' is 0-diml
Wald test for age_c age_c2 - age
in svyglm(formula = mpwb_sum ~ age_c + age_c2, design = svy_age)
F = 5.597306 on 1 and 53796 df: p= 0.017992
eff.p AIC deltabar
[1,] 2.906267 411195.5 1.453134
[2,] 5.151845 411187.5 1.717282
# Global from model
age_grid <-
tibble::tibble(age = base::seq(
base::min(df_age$age, na.rm = TRUE),
base::max(df_age$age, na.rm = TRUE),
by = 1
)) |>
dplyr::mutate(
age_c = scale(age, center = TRUE, scale = FALSE),
age_c2 = age_c^2
)
pred <-
stats::predict(model_age_quad,
newdata = age_grid,
se.fit = TRUE,
type = "response") |>
as.data.frame()
age_grid <-
age_grid |>
dplyr::mutate(
fit = pred$response,
se = pred$SE,
lo = fit - 1.96 * se,
hi = fit + 1.96 * se
)
ggplot(age_grid, aes(x = age, y = fit)) +
geom_ribbon(aes(ymin = lo, ymax = hi), alpha = 0.2, fill = "#6F7C91") +
geom_line(linewidth = 0.8, color = "#082444") +
labs(x = "Age", y = "Predicted MPWB sum")# Within-country
# Compute weighted means by country
df_age_cty <- df_age |>
group_by(country, age_group) |>
summarise(
mean_mpwb = Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE)
) |>
print_reactable(sorted_col = "country", width = 800)
# Within-country quadratic models
df_age_ctry <-
df_age |>
dplyr::group_by(country) |>
dplyr::mutate(
age_c = scale(age, center = TRUE, scale = FALSE),
age_c2 = age_c^2
) |>
dplyr::ungroup()
age_models <-
df_age_ctry |>
dplyr::group_by(country) |>
dplyr::group_modify(~ {
des_sub <- srvyr::as_survey_design(.x, weights = ps_weight)
fit <- survey::svyglm(
mpwb_sum ~ age_c + age_c2,
design = des_sub
)
broom::tidy(fit) |>
dplyr::filter(term %in% c("age_c", "age_c2"))
}) |>
dplyr::ungroup()
country_age <-
age_models |>
tidyr::pivot_wider(
id_cols = c(country),
names_from = term,
values_from = c(estimate, std.error, p.value),
names_glue = "{term}_{.value}"
) |>
dplyr::transmute(
country,
beta_age_c = age_c_estimate,
se_age_c = age_c_std.error,
p_age_c = age_c_p.value,
beta_age_c2 = age_c2_estimate,
se_age_c2 = age_c2_std.error,
p_age_c2 = age_c2_p.value,
w_n = w_n,
ci_low_age_c = beta_age_c - 1.96 * se_age_c,
ci_high_age_c = beta_age_c + 1.96 * se_age_c,
ci_low_age_c2 = beta_age_c2 - 1.96 * se_age_c2,
ci_high_age_c2 = beta_age_c2 + 1.96 * se_age_c2
)Error in `dplyr::transmute()`:
ℹ In argument: `w_n = w_n`.
Caused by error:
! object 'w_n' not found
# Count countries with CIs for age_c excluding zero
country_age |>
dplyr::ungroup() |>
dplyr::mutate(
significant_pos = ci_low_age_c > 0,
significant_neg = ci_high_age_c < 0
) |>
dplyr::summarise(
n_total = dplyr::n(),
n_sig_pos = base::sum(significant_pos, na.rm = TRUE),
n_sig_neg = base::sum(significant_neg, na.rm = TRUE),
prop_sig_pos = n_sig_pos / n_total,
prop_sig_neg = n_sig_neg / n_total
)Error: object 'country_age' not found
# Random-effects meta-analysis
res_age_c <-
metafor::rma(
yi = beta_age_c,
sei = se_age_c,
data = country_age,
method = "REML"
)Error: object 'country_age' not found
Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'res_age_c' not found
res_age_c2 <-
metafor::rma(
yi = beta_age_c2,
sei = se_age_c2,
data = country_age,
method = "REML"
)Error: object 'country_age' not found
Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'res_age_c2' not found
# Forest plot for linear age effect
forest_age <-
country_age |>
dplyr::arrange(-beta_age_c) |>
dplyr::mutate(
country = base::factor(country),
hover_txt = sprintf(
"%s<br>Slope age_c: %.2f",
stringr::str_to_title(country),
base::round(beta_age_c, 2)
)
)Error: object 'country_age' not found
page <- ggplot(forest_age, aes(x = beta_age_c, y = stats::reorder(country, beta_age_c), text = hover_txt)) +
geom_point() +
geom_errorbarh(aes(xmin = ci_low_age_c, xmax = ci_high_age_c), height = 0.2) +
geom_vline(xintercept = coef(res_age_c), linetype = "dashed", color = "#4ca3df") +
labs(
x = "Age Slopes for MPWB Sum",
y = "",
) +
theme(
panel.grid.major.y = ggplot2::element_line(
color = "#ddeded", linewidth = 0.25)
)Error: object 'forest_age' not found
Error in UseMethod("ggplotly", p): no applicable method for 'ggplotly' applied to an object of class "function"
# Plot per-country age quadratic term
age_curves_cty <-
df_age |>
dplyr::group_by(country) |>
dplyr::group_modify( ~ {
# subset data for this country
age_min <- base::floor(base::min(.x$age, na.rm = TRUE))
age_max <- base::ceiling(base::max(.x$age, na.rm = TRUE))
data_sub <-
.x |>
dplyr::mutate(age_c = scale(age, center = TRUE, scale = FALSE),
age_c2 = age_c^2)
des_sub <-
srvyr::as_survey_design(data_sub, weights = ps_weight)
fit_sub <-
survey::svyglm(mpwb_sum ~ age_c + age_c2, design = des_sub)
age_grid <-
tibble::tibble(age = base::seq(age_min, age_max, by = 1)) |>
dplyr::mutate(age_c = scale(age, center = TRUE, scale = FALSE),
age_c2 = age_c^2)
pred <-
stats::predict(fit_sub,
newdata = age_grid,
se.fit = TRUE,
type = "response") |> as.data.frame()
age_grid |>
dplyr::mutate(
fit = pred$response,
se = pred$SE,
lo = fit - 1.96 * se,
hi = fit + 1.96 * se
)
}) |>
dplyr::ungroup()
ggplot2::ggplot(age_curves_cty, ggplot2::aes(x = age, y = fit)) +
ggplot2::geom_ribbon(
ggplot2::aes(ymin = lo, ymax = hi), alpha = 0.2, fill = "#6F7C91") +
ggplot2::geom_line(linewidth = 0.6, color = "#082444") +
ggplot2::facet_wrap(
~ country, scales = "free", ncol = 4, nrow = 25) +
ggplot2::labs(x = "Age", y = "Predicted MPWB sum")MPWB and Employment Within Countries
df_empl <-
df_gmh |>
dplyr::filter(!is.na(employment_primary))
# Global
svy_empl <-
survey::svydesign(
ids = ~1,
data = df_empl,
weights = ~ps_weight
)
model_empl <-
survey::svyglm(mpwb_sum ~ employment_primary, design = svy_empl)
summary(model_empl)
Call:
svyglm(formula = mpwb_sum ~ employment_primary, design = svy_empl)
Survey design:
survey::svydesign(ids = ~1, data = df_empl, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 42.6006 0.2974 143.262 <0.0000000000000002 ***
employment_primaryNot in paid employment (looking for work) 0.9126 0.3854 2.368 0.0179 *
employment_primaryStudent non-working (Full or part-time) 4.7654 0.3384 14.084 <0.0000000000000002 ***
employment_primaryEmployed/working full-time (25+ hours per week) 4.7769 0.3100 15.410 <0.0000000000000002 ***
employment_primaryEmployed/working part-time (less than 25 hours per week) 4.9788 0.3515 14.165 <0.0000000000000002 ***
employment_primaryRetired 5.7336 0.4007 14.308 <0.0000000000000002 ***
employment_primaryMilitary service 8.8359 0.5890 15.002 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 120.0555)
Number of Fisher Scoring iterations: 2
# A tibble: 1 × 7
Ward_F df1 df2 p r2 cohens_f percent_var_explained
<chr> <int> <dbl> <chr> <chr> <chr> <chr>
1 92.49 6 53761 < .001 0.0235 0.1551 2.3493
employment_primary emmean SE df lower.CL upper.CL
Not in paid employment (by choice/health) 42.60 0.2970 53761 42.02 43.18
Not in paid employment (looking for work) 43.51 0.2450 53761 43.03 43.99
Student non-working (Full or part-time) 47.37 0.1610 53761 47.05 47.68
Employed/working full-time (25+ hours per week) 47.38 0.0876 53761 47.21 47.55
Employed/working part-time (less than 25 hours per week) 47.58 0.1870 53761 47.21 47.95
Retired 48.33 0.2690 53761 47.81 48.86
Military service 51.44 0.5080 53761 50.44 52.43
Confidence level used: 0.95
adj_contr <-
emmeans::contrast(
emm_empl,
method = "pairwise",
adjust = "holm"
)
summary(
adj_contr,
infer = TRUE
) contrast estimate SE df lower.CL
(Not in paid employment (by choice/health)) - Not in paid employment (looking for work) -0.9126 0.385 53761 -2.084
(Not in paid employment (by choice/health)) - (Student non-working (Full or part-time)) -4.7654 0.338 53761 -5.793
(Not in paid employment (by choice/health)) - (Employed/working full-time (25+ hours per week)) -4.7769 0.310 53761 -5.719
(Not in paid employment (by choice/health)) - (Employed/working part-time (less than 25 hours per week)) -4.9788 0.351 53761 -6.047
(Not in paid employment (by choice/health)) - Retired -5.7336 0.401 53761 -6.951
(Not in paid employment (by choice/health)) - Military service -8.8359 0.589 53761 -10.625
Not in paid employment (looking for work) - (Student non-working (Full or part-time)) -3.8529 0.294 53761 -4.745
Not in paid employment (looking for work) - (Employed/working full-time (25+ hours per week)) -3.8644 0.260 53761 -4.655
Not in paid employment (looking for work) - (Employed/working part-time (less than 25 hours per week)) -4.0662 0.309 53761 -5.004
Not in paid employment (looking for work) - Retired -4.8210 0.364 53761 -5.926
Not in paid employment (looking for work) - Military service -7.9233 0.564 53761 -9.638
(Student non-working (Full or part-time)) - (Employed/working full-time (25+ hours per week)) -0.0115 0.184 53761 -0.569
(Student non-working (Full or part-time)) - (Employed/working part-time (less than 25 hours per week)) -0.2133 0.247 53761 -0.965
(Student non-working (Full or part-time)) - Retired -0.9682 0.313 53761 -1.920
(Student non-working (Full or part-time)) - Military service -4.0704 0.533 53761 -5.691
(Employed/working full-time (25+ hours per week)) - (Employed/working part-time (less than 25 hours per week)) -0.2018 0.207 53761 -0.830
(Employed/working full-time (25+ hours per week)) - Retired -0.9567 0.283 53761 -1.815
(Employed/working full-time (25+ hours per week)) - Military service -4.0589 0.516 53761 -5.626
(Employed/working part-time (less than 25 hours per week)) - Retired -0.7548 0.328 53761 -1.750
(Employed/working part-time (less than 25 hours per week)) - Military service -3.8571 0.542 53761 -5.503
Retired - Military service -3.1023 0.575 53761 -4.849
upper.CL t.ratio p.value
0.2585 -2.368 0.0895
-3.7374 -14.084 <.0001
-3.8351 -15.410 <.0001
-3.9108 -14.165 <.0001
-4.5161 -14.308 <.0001
-7.0464 -15.002 <.0001
-2.9609 -13.124 <.0001
-3.0733 -14.841 <.0001
-3.1285 -13.175 <.0001
-3.7160 -13.255 <.0001
-6.2084 -14.037 <.0001
0.5464 -0.063 0.9876
0.5382 -0.863 0.9876
-0.0160 -3.089 0.0120
-2.4498 -7.631 <.0001
0.4266 -0.976 0.9876
-0.0983 -3.386 0.0050
-2.4916 -7.868 <.0001
0.2403 -2.305 0.0895
-2.2108 -7.118 <.0001
-1.3553 -5.395 <.0001
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 21 estimates
P value adjustment: holm method for 21 tests
plot(emm_empl, adjust = "holm", xlab = "Employment Status",
ylab = "Estimated Marginal Means of MPWB Sum")# Within-country
# Descriptive summaries within countries
df_empl_cty <- df_empl |>
group_by(country, employment_primary) |>
summarise(
mean_mpwb = Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE)
) |>
print_reactable(sorted_col = "country", width = 800)
empl_print_summ_cty <-
df_empl |>
dplyr::group_by(country) |>
dplyr::group_modify(~ {
des_sub <-
srvyr::as_survey_design(.x, weights = ps_weight)
fit_sub <-
survey::svyglm(
mpwb_sum ~ employment_primary,
design = des_sub
)
print_summ(fit_sub, des_sub, "mpwb_sum", "employment_primary")
}) |>
dplyr::ungroup()
empl_print_summ_cty |>
print_reactable(sorted_col = "country", width = 800)# Pairwise comparisons within countries
empl_adj_contr_cty <-
df_empl |>
dplyr::group_by(country) |>
dplyr::group_modify(~ {
des_sub <-
srvyr::as_survey_design(.x, weights = ps_weight)
fit_sub <-
survey::svyglm(
mpwb_sum ~ employment_primary,
design = des_sub
)
emm_sub <-
emmeans::emmeans(
fit_sub,
~ employment_primary,
data = .x
)
adj_sub <-
emmeans::contrast(
emm_sub,
method = "pairwise",
adjust = "holm"
)
summary(
adj_sub,
infer = TRUE
) |>
base::as.data.frame() |>
tibble::as_tibble()
}) |>
dplyr::ungroup()
empl_adj_contr_cty |>
print_reactable(sorted_col = "country", width = 800)# Random effects meta-analysis for contrast
meta_empl_contr <-
empl_adj_contr_cty |>
dplyr::group_by(contrast) |>
dplyr::group_modify(~ {
fit <- metafor::rma(
yi = .x$estimate,
sei = .x$SE,
method = "REML"
)
tibble::tibble(
k = fit$k,
pooled_est = base::round(fit$b[1, 1], 3),
pooled_se = base::round(fit$se[1], 3),
ci_low = base::round(fit$ci.lb, 2),
ci_high = base::round(fit$ci.ub, 2),
z = base::round(fit$zval, 3),
p_value = base::round(fit$pval, 4),
tau2 = base::round(fit$tau2, 3),
i2 = base::round(fit$I2, 2),
h2 = base::round(fit$H2, 2)
)
}) |>
dplyr::ungroup()
meta_empl_contr |>
print_reactable(sorted_col = "contrast", width = 800)MPWB and Education Within Countries
df_edu <-
df_gmh |>
dplyr::filter(!is.na(education_recoded_cat))
# Global
svy_edu <-
survey::svydesign(
ids = ~1,
data = df_edu,
weights = ~ps_weight
)
model_edu <-
survey::svyglm(
mpwb_sum ~ education_recoded_cat,
design = svy_edu)
summary(model_edu)
Call:
svyglm(formula = mpwb_sum ~ education_recoded_cat, design = svy_edu)
Survey design:
survey::svydesign(ids = ~1, data = df_edu, weights = ~ps_weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.52441 0.09554 486.973 <0.0000000000000002 ***
education_recoded_cat.L 3.54677 0.26074 13.603 <0.0000000000000002 ***
education_recoded_cat.Q 0.16379 0.24292 0.674 0.500
education_recoded_cat.C -0.24390 0.16679 -1.462 0.144
education_recoded_cat^4 -0.22232 0.16654 -1.335 0.182
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 120.6988)
Number of Fisher Scoring iterations: 2
# A tibble: 1 × 7
Ward_F df1 df2 p r2 cohens_f percent_var_explained
<chr> <int> <dbl> <chr> <chr> <chr> <chr>
1 125.80 4 53764 < .001 0.0181 0.1359 1.8128
education_recoded_cat emmean SE df lower.CL upper.CL
Less than secondary 44.42 0.3890 53764 43.66 45.18
Secondary 45.31 0.1400 53764 45.04 45.59
Technical 46.28 0.1910 53764 45.90 46.65
University 47.86 0.0966 53764 47.67 48.05
Advanced 48.75 0.1050 53764 48.54 48.96
Confidence level used: 0.95
adj_contr <-
emmeans::contrast(
emm_edu,
method = "pairwise",
adjust = "holm"
)
summary(
adj_contr,
infer = TRUE
) contrast estimate SE df lower.CL upper.CL t.ratio p.value
Less than secondary - Secondary -0.892 0.414 53764 -2.05 0.270 -2.155 0.0311
Less than secondary - Technical -1.858 0.434 53764 -3.08 -0.640 -4.283 0.0001
Less than secondary - University -3.443 0.401 53764 -4.57 -2.317 -8.583 <.0001
Less than secondary - Advanced -4.332 0.403 53764 -5.46 -3.200 -10.739 <.0001
Secondary - Technical -0.966 0.237 53764 -1.63 -0.301 -4.079 0.0001
Secondary - University -2.552 0.170 53764 -3.03 -2.075 -15.019 <.0001
Secondary - Advanced -3.440 0.175 53764 -3.93 -2.949 -19.656 <.0001
Technical - University -1.585 0.214 53764 -2.19 -0.984 -7.399 <.0001
Technical - Advanced -2.474 0.218 53764 -3.09 -1.861 -11.329 <.0001
University - Advanced -0.889 0.143 53764 -1.29 -0.487 -6.216 <.0001
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 10 estimates
P value adjustment: holm method for 10 tests
# Within-country
# Descriptive summaries within countries
df_edu_cty <- df_edu |>
group_by(country, education_recoded_cat) |>
summarise(
mean_mpwb = Hmisc::wtd.mean(mpwb_sum, weights = ps_weight, na.rm = TRUE)
) |>
print_reactable(sorted_col = "country", width = 800)
edu_adj_contr_cty <-
df_edu |>
dplyr::group_by(country) |>
dplyr::group_modify(~ {
des_sub <-
srvyr::as_survey_design(.x, weights = ps_weight)
fit_sub <-
survey::svyglm(
mpwb_sum ~ education_recoded_cat,
design = des_sub
)
emm_sub <-
emmeans::emmeans(
fit_sub,
~ education_recoded_cat,
data = .x
)
adj_sub <-
emmeans::contrast(
emm_sub,
method = "pairwise",
adjust = "holm"
)
summary(
adj_sub,
infer = TRUE
) |>
base::as.data.frame() |>
tibble::as_tibble()
}) |>
dplyr::ungroup()
edu_adj_contr_cty |>
print_reactable(sorted_col = "country", width = 800)edu_lesssec_vs_uni <-
edu_adj_contr_cty |>
dplyr::filter(contrast == "Less than secondary - University")
# Random-effects meta-analysis
meta_lesssec_vs_uni <-
metafor::rma(
yi = edu_lesssec_vs_uni$estimate,
sei = edu_lesssec_vs_uni$SE,
method = "REML"
)
summary(meta_lesssec_vs_uni)
Random-Effects Model (k = 86; tau^2 estimator: REML)
logLik deviance AIC BIC AICc
-267.1218 534.2436 538.2436 543.1289 538.3899
tau^2 (estimated amount of total heterogeneity): 22.0573 (SE = 4.6487)
tau (square root of estimated tau^2 value): 4.6965
I^2 (total heterogeneity / total variability): 86.00%
H^2 (total variability / sampling variability): 7.14
Test for Heterogeneity:
Q(df = 85) = 565.4069, p-val < .0001
Model Results:
estimate se zval pval ci.lb ci.ub
-3.1008 0.6040 -5.1334 <.0001 -4.2847 -1.9169 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
A16. Indications of Mental Illness
PHQ-4 Descriptives
df_phq <- df_gmh |>
dplyr::filter(!is.na(gad_worry))
# Global
df_long_phq <-
df_phq |>
dplyr::select(ps_weight, dplyr::starts_with("gad"), dplyr::starts_with("phq")) |>
dplyr::select(-phq4_cat) |>
tidyr::pivot_longer(
cols = -ps_weight,
names_to = "variable",
values_to = "value"
)
summ_phq <-
df_long_phq |>
dplyr::group_by(variable) |>
dplyr::summarise(
unweighted_n = base::round(base::sum(!is.na(value)), 2),
unweighted_mean = base::round(base::mean(value, na.rm = TRUE), 2),
unweighted_sd = base::round(stats::sd(value, na.rm = TRUE), 2),
unweighted_median = base::round(stats::median(value, na.rm = TRUE), 2),
unweighted_iqr = base::round(stats::IQR(value, na.rm = TRUE), 2),
weighted_n = base::round(base::sum(ps_weight), 2),
weighted_mean = base::round(Hmisc::wtd.mean(
value, ps_weight,
na.rm = TRUE
), 2),
weighted_sd = base::round(sqrt(Hmisc::wtd.var(value, ps_weight, na.rm = TRUE)), 2),
q25 = base::round(Hmisc::wtd.quantile(
value,
weights = ps_weight,
probs = 0.25,
na.rm = TRUE,
normwt = FALSE
), 2),
weighted_median = base::round(Hmisc::wtd.quantile(
value,
weights = ps_weight,
probs = 0.5,
na.rm = TRUE,
normwt = FALSE
), 2),
q75 = base::round(Hmisc::wtd.quantile(
value,
weights = ps_weight,
probs = 0.75,
na.rm = TRUE,
normwt = FALSE
), 2),
.groups = "drop"
) |>
dplyr::mutate(
weighted_iqr = q75 - q25
) |>
dplyr::select(
variable,
unweighted_n,
unweighted_mean,
unweighted_sd,
unweighted_median,
unweighted_iqr,
weighted_n,
weighted_mean,
weighted_sd,
weighted_median,
weighted_iqr
)
summ_phq |>
print_reactable(sorted_col = "variable", width = 800)# Within-country
df_long_phq <-
df_phq |>
dplyr::select(
country,
ps_weight,
dplyr::starts_with("gad"),
dplyr::starts_with("phq")
) |>
dplyr::select(-phq4_cat) |>
tidyr::pivot_longer(
cols = -c(country, ps_weight),
names_to = "variable",
values_to = "value"
)
summ_phq_cty <-
df_long_phq |>
dplyr::group_by(country, variable) |>
dplyr::summarise(
unweighted_n = base::round(base::sum(!is.na(value)), 2),
unweighted_mean = base::round(base::mean(value, na.rm = TRUE), 2),
unweighted_sd = base::round(stats::sd(value, na.rm = TRUE), 2),
unweighted_median = base::round(stats::median(value, na.rm = TRUE), 2),
unweighted_iqr = base::round(stats::IQR(value, na.rm = TRUE), 2),
weighted_n = base::round(base::sum(ps_weight[!is.na(value)]), 2),
weighted_mean = base::round(Hmisc::wtd.mean(
value,
weights = ps_weight,
na.rm = TRUE
), 2),
weighted_sd = base::round(base::sqrt(
Hmisc::wtd.var(
value,
weights = ps_weight,
na.rm = TRUE
)
), 2),
q25 = base::round(Hmisc::wtd.quantile(
value,
weights = ps_weight,
probs = 0.25,
na.rm = TRUE,
normwt = FALSE
), 2),
weighted_median = base::round(Hmisc::wtd.quantile(
value,
weights = ps_weight,
probs = 0.5,
na.rm = TRUE,
normwt = FALSE
), 2),
q75 = base::round(Hmisc::wtd.quantile(
value,
weights = ps_weight,
probs = 0.75,
na.rm = TRUE,
normwt = FALSE
), 2),
.groups = "drop"
) |>
dplyr::mutate(
weighted_iqr = q75 - q25
) |>
dplyr::select(
country,
variable,
unweighted_n,
unweighted_mean,
unweighted_sd,
unweighted_median,
unweighted_iqr,
weighted_n,
weighted_mean,
weighted_sd,
weighted_median,
weighted_iqr
)
summ_phq_cty |>
print_reactable(sorted_col = "country", width = 800)Prevalence Estimates (Percentages) of PHQ-2, GAD-2, and PHQ-4
# Global
dplyr::summarise(df_phq,
n_uw = dplyr::n(),
n_w_eff =
(base::sum(ps_weight, na.rm = TRUE)^2) / base::sum(ps_weight^2, na.rm = TRUE),
sum_w =
base::sum(ps_weight, na.rm=TRUE),
phq2_3_unw =
base::round(base::mean(phq2_sum_rec >= 3, na.rm = TRUE) * 100, 2),
phq2_3_w =
base::round(Hmisc::wtd.mean(as.numeric(phq2_sum_rec >= 3), ps_weight) * 100, 2),
gad2_3_unw =
base::round(base::mean(gad2_sum_rec >= 3, na.rm = TRUE) * 100, 2),
gad2_3_w =
base::round(Hmisc::wtd.mean(as.numeric(gad2_sum_rec >= 3), ps_weight) * 100, 2),
phq4_3_unw =
base::round(mean(phq4_sum_rec >= 3 & phq4_sum_rec <=5, na.rm = TRUE) * 100, 2),
phq4_3_w =
base::round(Hmisc::wtd.mean(phq4_sum_rec >= 3 & phq4_sum_rec <=5, ps_weight) * 100, 2),
phq4_6_unw =
base::round(mean(phq4_sum_rec >= 6 & phq4_sum_rec <=8, na.rm = TRUE) * 100, 2),
phq4_6_w =
base::round(Hmisc::wtd.mean(phq4_sum_rec >= 6 & phq4_sum_rec <=8, ps_weight) * 100),
phq4_9_unw =
base::round(mean(phq4_sum_rec >= 9 & phq4_sum_rec <= 12, na.rm = TRUE) * 100, 2),
phq4_9_w =
base::round(Hmisc::wtd.mean(phq4_sum_rec >= 9 & phq4_sum_rec <=12, ps_weight) * 100, 2)
) |>
tidyr::pivot_longer(
cols = everything(),
names_to = "measure",
values_to = "prevalence"
)# A tibble: 13 × 2
measure prevalence
<chr> <dbl>
1 n_uw 38509
2 n_w_eff 20971.
3 sum_w 26610.
4 phq2_3_unw 20.3
5 phq2_3_w 21.0
6 gad2_3_unw 22.1
7 gad2_3_w 22.6
8 phq4_3_unw 55.3
9 phq4_3_w 53.7
10 phq4_6_unw 12.8
11 phq4_6_w 13
12 phq4_9_unw 9.6
13 phq4_9_w 10.2
# Within-country
within_phq <-
df_phq |>
dplyr::group_by(country) |>
dplyr::summarise(
n_uw = dplyr::n(),
n_w_eff =
base::round(
(base::sum(ps_weight, na.rm = TRUE)^2) /
base::sum(ps_weight^2, na.rm = TRUE),
2),
sum_w = base::round(base::sum(ps_weight, na.rm = TRUE), 2),
# PHQ-2
phq2_3_unw =
base::round(base::mean(depression_screen, na.rm = TRUE) * 100, 2),
phq2_3_w =
base::round(Hmisc::wtd.mean(depression_screen, ps_weight, na.rm = TRUE) * 100, 2),
phq2_3_pos_uw =
base::sum(depression_screen == 1, na.rm = TRUE),
phq2_3_pos_sum_w =
base::round(base::sum(ps_weight[depression_screen == 1 & !is.na(depression_screen)], na.rm = TRUE), 0),
# GAD-2
gad2_3_unw =
base::round(base::mean(anxiety_screen, na.rm = TRUE) * 100, 2),
gad2_3_w =
base::round(Hmisc::wtd.mean(anxiety_screen, ps_weight, na.rm = TRUE) * 100, 2),
gad2_3_pos_uw =
base::sum(anxiety_screen == 1, na.rm = TRUE),
gad2_3_pos_w =
base::round(base::sum(ps_weight[anxiety_screen == 1 & !is.na(anxiety_screen)], na.rm = TRUE), 2),
# PHQ-4 mild (3–5)
phq4_3_unw =
base::round(base::mean(phq4_sum_rec >= 3 & phq4_sum_rec <= 5, na.rm = TRUE) * 100, 2),
phq4_3_w =
base::round(Hmisc::wtd.mean(phq4_sum_rec >= 3 & phq4_sum_rec <= 5, ps_weight, na.rm = TRUE) * 100, 2),
phq4_3_pos_uw =
base::sum(phq4_sum_rec >= 3 & phq4_sum_rec <= 5, na.rm = TRUE),
phq4_3_pos_w =
base::round(base::sum(ps_weight[phq4_sum_rec >= 3 & phq4_sum_rec <= 5 & !is.na(phq4_sum_rec)], na.rm = TRUE), 2),
# PHQ-4 moderate (6–8)
phq4_6_unw =
base::round(base::mean(phq4_sum_rec >= 6 & phq4_sum_rec <= 8, na.rm = TRUE) * 100, 2),
phq4_6_w =
base::round(Hmisc::wtd.mean(phq4_sum_rec >= 6 & phq4_sum_rec <= 8, ps_weight, na.rm = TRUE) * 100, 2),
phq4_6_pos_uw =
base::sum(phq4_sum_rec >= 6 & phq4_sum_rec <= 8, na.rm = TRUE),
phq4_6_pos_w =
base::round(base::sum(ps_weight[phq4_sum_rec >= 6 & phq4_sum_rec <= 8 & !is.na(phq4_sum_rec)], na.rm = TRUE), 0),
# PHQ-4 severe (9–12)
phq4_9_unw =
base::round(base::mean(phq4_sum_rec >= 9 & phq4_sum_rec <= 12, na.rm = TRUE) * 100, 2),
phq4_9_w =
base::round(Hmisc::wtd.mean(phq4_sum_rec >= 9 & phq4_sum_rec <= 12, ps_weight, na.rm = TRUE) * 100, 2),
phq4_9_pos_uw =
base::sum(phq4_sum_rec >= 9 & phq4_sum_rec <= 12, na.rm = TRUE),
phq4_9_pos_w =
base::round(base::sum(ps_weight[phq4_sum_rec >= 9 & phq4_sum_rec <= 12 & !is.na(phq4_sum_rec)], na.rm = TRUE), 0)
)
within_phq |> print_reactable(sorted_col = "country", width = 800)plot_df <-
within_phq |>
dplyr::mutate(
country = forcats::fct_reorder(country, gad2_3_w, .desc = TRUE)
) |>
tidyr::pivot_longer(
cols = c(phq2_3_w, gad2_3_w),
names_to = "screen",
values_to = "estimate"
) |>
dplyr::mutate(
screen = dplyr::recode(
screen,
phq2_3_w = "PHQ-2 ≥ 3",
gad2_3_w = "GAD-2 ≥ 3"
)
)
ggplot2::ggplot(plot_df, ggplot2::aes(x = estimate, y = country)) +
ggplot2::geom_point() +
ggplot2::labs(
x = "Screen-positive (%)",
y = "",
) +
ggplot2::facet_wrap(~screen, ncol = 2, scales = "free_x") +
theme(panel.grid.major.y = ggplot2::element_line(
color = "#ddeded", linewidth = 0.25)
)## -------------------------------------------------------------------
## 2) p_phq4_country: PHQ-4 mean by country (+ severity cutpoints)
## -------------------------------------------------------------------
phq4_country <- df_gmh %>%
filter(!is.na(country), !is.na(phq4_sum)) %>%
group_by(country) %>%
summarise(
n = n(),
mean = mean(phq4_sum),
sd = sd(phq4_sum),
se = ifelse(n > 1, sd/sqrt(n), NA_real_),
lo = mean - 1.96*se,
hi = mean + 1.96*se,
.groups = "drop"
) %>%
arrange(mean) %>%
mutate(country = factor(country, levels = country))
overall_phq4_mean <- mean(df_gmh$phq4_sum, na.rm = TRUE)
p_phq4_country <- ggplot(phq4_country, aes(x = mean, y = fct_rev(country))) +
geom_vline(xintercept = overall_phq4_mean, linetype = "dashed", color = "grey40") +
geom_vline(xintercept = c(3,6,9), linetype = "dotted", color = "grey60") +
geom_errorbarh(aes(xmin = lo, xmax = hi), height = 0.2) +
geom_point(size = 2) +
scale_x_continuous(limits = range(c(phq4_country$lo, phq4_country$hi), na.rm = TRUE),
expand = expansion(mult = c(0.00, 0.04))) +
labs(title = "PHQ-4 mean by country",
x = "PHQ-4 (0–12), mean ± 95% CI", y = NULL) +
theme_minimal(base_size = 12) +
theme(panel.grid.minor = element_blank())Figure 6
# Plot A: Ridge plot of MPWB by PHQ-4
df_phq <- df_phq |>
dplyr::mutate(
phq4_cat2 = base::factor(
dplyr::case_when(
phq4_cat == "Normal (0–2)" ~ "0-2\nNormal",
phq4_cat == "Mild (3–5)" ~ "3-5\nMild",
phq4_cat == "Moderate (6–8)" ~ "6-8\nModerate",
phq4_cat == "Severe (9–12)" ~ "9-12\nSevere",
TRUE ~ phq4_cat
),
levels = c(
"0-2\nNormal",
"3-5\nMild",
"6-8\nModerate",
"9-12\nSevere"
)
)
)
# Palette
phq4_cols <- c(
"0-2\nNormal" = "#E6EEF7",
"3-5\nMild" = "#C9DBF0",
"6-8\nModerate" = "#9FBDE0",
"9-12\nSevere" = "#5B88C8"
)
p_ridge <-
# We are going to pass unweighted mpwb_sum to the aes because the weights
# are applied within geom_density_ridges.
ggplot(df_phq, aes(x = mpwb_sum, y = phq4_cat2, fill = phq4_cat2)) +
geom_density_ridges(
aes(weight = ps_weight),
scale = 1.5, rel_min_height = 0.01, alpha = 0.9, color = "white"
) +
scale_x_continuous(
limits = c(0, 80),
breaks = seq(0, 70, by = 10),
expand = c(0, 0)
) +
scale_fill_manual(values = phq4_cols) +
coord_cartesian(xlim = c(5, 80)) +
labs(
x = "MPWB Sum",
y = "PHQ-4\n"
) +
theme(
axis.line.x = element_blank(),
axis.text.y = element_text(color = "#051520", margin = margin(t = 1), face = "bold"),
legend.position = "none",
panel.grid.major.y = element_line(color = "#ECF3F3", linewidth = 0.4),
axis.title.y = element_text(margin = margin(r = 10), color = "#051520", face = "bold"),
axis.title.x = element_text(margin = margin(t = 10), color = "#051520", face = "bold")
) +
geom_vline(xintercept = 40, linetype = "dashed", color = "#6F7C91", linewidth = 0.6, alpha = 0.9)
# Histogram of MPWB sum
ggplot(df_phq, aes(x = mpwb_sum)) +
geom_histogram(binwidth = 1, color = "#5B88C8", fill = "#9FBDE0") +
labs(
x = "MPWB sum",
y = "Frequency"
)# Plot B: PHQ-4 and MPWB by country
svy_phq <- survey::svydesign(ids = ~ 1, weights = ~ ps_weight, data = df_phq)
means_cty <- survey::svyby(
~ phq4_sum_rec + mpwb_sum,
~ country,
svy_phq,
survey::svymean,
na.rm = TRUE,
vartype = NULL
) |>
as.data.frame()
weights_cty <- df_phq |>
dplyr::group_by(country, iso2) |>
dplyr::summarise(ps_weight = sum(ps_weight, na.rm = TRUE), .groups = "drop") |>
dplyr::mutate(
iso2 = tolower(iso2),
alpha_country = dplyr::if_else(country %in% flagged_countries, 0.5, 1)
)
means_cty <- means_cty |>
dplyr::left_join(weights_cty, by = "country")
r_w_val <- weighted_corr(means_cty, phq4_sum_rec, mpwb_sum)[[1]]
p_phq4_mpwb <-
ggplot(means_cty, aes(x = phq4_sum_rec, y = mpwb_sum)) +
geom_smooth(
method = "lm",
se = FALSE,
color = "#6F7C91",
linewidth = 0.8
) +
labs(
subtitle = bquote("Pearson's " ~ italic(r) ~ "=" ~ .(r_w_val)),
x = "PHQ-4",
y = "MPWB Sum"
) +
geom_point(
aes(alpha = alpha_country),
shape = 21,
colour = "#051520",
size = 4
) +
with_shadow(
geom_point(
aes(alpha = alpha_country),
size = 4.2,
alpha = 0.5,
stroke = 0
),
sigma = 2,
colour = "gray60",
x_offset = 1,
y_offset = 1
) +
ggflags::geom_flag(aes(country = iso2), size = 3.5, na.rm = TRUE) +
theme(
plot.subtitle = element_text(color = "#051520", family = ""),
legend.position = "none",
axis.line.x = element_blank(),
panel.grid.major.y = element_line(color = "#ECF3F3", linewidth = 0.4),
axis.title.x = element_text(
margin = margin(t = 10),
color = "#051520",
face = "bold"
),
axis.title.y = element_text(
margin = margin(r = 10),
color = "#051520",
face = "bold"
),
axis.text.x = element_text(
color = "#051520",
margin = margin(t = 1),
face = "plain"
)
) +
guides(alpha = "none") + coord_flip()Correlation between PHQ-2 and GAD-2
means_cty_gp <- survey::svyby(
~ phq2_sum_rec + gad2_sum_rec,
~ country,
svy_phq,
survey::svymean,
na.rm = TRUE,
vartype = NULL
) |>
as.data.frame()
weights_cty_gp <- df_phq |>
dplyr::group_by(country, iso2) |>
dplyr::summarise(ps_weight = sum(ps_weight, na.rm = TRUE), .groups = "drop") |>
dplyr::mutate(
iso2 = tolower(iso2),
alpha_country = dplyr::if_else(country %in% flagged_countries, 0.5, 1)
)
means_cty_gp <- means_cty_gp |>
dplyr::left_join(weights_cty, by = "country")
r_w_val <- weighted_corr(means_cty_gp, phq2_sum_rec, gad2_sum_rec)[[1]]
p_phq2_gad2 <-
ggplot(means_cty_gp, aes(x = phq2_sum_rec, y = gad2_sum_rec)) +
geom_smooth(
method = "lm",
se = FALSE,
color = "#6F7C91",
linewidth = 0.8
) +
labs(
subtitle = bquote("Pearson's " ~ italic(r) ~ "=" ~ .(r_w_val)),
x = "PHQ-2",
y = "GAD2"
) +
geom_point(
aes(alpha = alpha_country),
shape = 21,
colour = "#051520",
size = 4
) +
with_shadow(
geom_point(
aes(alpha = alpha_country),
size = 4.2,
alpha = 0.5,
stroke = 0
),
sigma = 2,
colour = "gray60",
x_offset = 1,
y_offset = 1
) +
ggflags::geom_flag(aes(country = iso2), size = 3.5, na.rm = TRUE) +
theme(
plot.subtitle = element_text(color = "#051520", family = ""),
legend.position = "none",
axis.line.x = element_blank(),
panel.grid.major.y = element_line(color = "#ECF3F3", linewidth = 0.4),
axis.title.x = element_text(
margin = margin(t = 10),
color = "#051520",
face = "bold"
),
axis.title.y = element_text(
margin = margin(r = 10),
color = "#051520",
face = "bold"
),
axis.text.x = element_text(
color = "#051520",
margin = margin(t = 1),
face = "plain"
)
) +
guides(alpha = "none") + coord_flip()Information About the R Session
─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
setting value
version R version 4.5.2 (2025-10-31)
os macOS Sequoia 15.6
system aarch64, darwin20
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Amsterdam
date 2025-12-15
pandoc 3.6.3 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
quarto 1.4.549 @ /usr/local/bin/quarto
─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
abind 1.4-8 2024-09-12 [1] CRAN (R 4.5.0)
archive 1.1.12 2025-03-20 [1] CRAN (R 4.5.0)
askpass 1.2.1 2024-10-04 [1] CRAN (R 4.5.0)
backports 1.5.0 2024-05-23 [1] CRAN (R 4.5.0)
base64enc 0.1-3 2015-07-28 [1] CRAN (R 4.5.0)
binom * 1.1-1.1 2022-05-02 [1] CRAN (R 4.5.0)
bit 4.6.0 2025-03-06 [1] CRAN (R 4.5.0)
bit64 4.6.0-1 2025-01-16 [1] CRAN (R 4.5.0)
boot 1.3-32 2025-08-29 [1] CRAN (R 4.5.2)
broom 1.0.9 2025-07-28 [1] CRAN (R 4.5.0)
broom.mixed * 0.2.9.6 2024-10-15 [1] CRAN (R 4.5.0)
car * 3.1-3 2024-09-27 [1] CRAN (R 4.5.0)
carData * 3.0-5 2022-01-06 [1] CRAN (R 4.5.0)
cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.5.0)
checkmate 2.3.2 2024-07-29 [1] CRAN (R 4.5.0)
chromote 0.5.1 2025-04-24 [1] CRAN (R 4.5.0)
class 7.3-23 2025-01-01 [1] CRAN (R 4.5.2)
classInt 0.4-11 2025-01-08 [1] CRAN (R 4.5.0)
cli 3.6.5 2025-04-23 [1] CRAN (R 4.5.0)
cluster 2.1.8.1 2025-03-12 [1] CRAN (R 4.5.2)
coda 0.19-4.1 2024-01-31 [1] CRAN (R 4.5.0)
codetools 0.2-20 2024-03-31 [1] CRAN (R 4.5.2)
colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.5.0)
commonmark 2.0.0 2025-07-07 [1] CRAN (R 4.5.0)
corrplot * 0.95 2024-10-14 [1] CRAN (R 4.5.0)
countrycode * 1.6.1 2025-03-31 [1] CRAN (R 4.5.0)
cowplot * 1.2.0 2025-07-07 [1] CRAN (R 4.5.0)
crayon 1.5.3 2024-06-20 [1] CRAN (R 4.5.0)
crosstalk 1.2.1 2023-11-23 [1] CRAN (R 4.5.0)
curl 7.0.0 2025-08-19 [1] CRAN (R 4.5.0)
data.table 1.17.8 2025-07-10 [1] CRAN (R 4.5.0)
DBI 1.2.3 2024-06-02 [1] CRAN (R 4.5.0)
digest 0.6.37 2024-08-19 [1] CRAN (R 4.5.0)
dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.5.0)
e1071 1.7-16 2024-09-16 [1] CRAN (R 4.5.0)
emmeans * 1.11.2 2025-07-11 [1] CRAN (R 4.5.0)
estimability 1.5.1 2024-05-12 [1] CRAN (R 4.5.0)
evaluate 1.0.4 2025-06-18 [1] CRAN (R 4.5.0)
farver 2.1.2 2024-05-13 [1] CRAN (R 4.5.0)
fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.5.0)
flextable * 0.9.10 2025-08-24 [1] CRAN (R 4.5.0)
fontBitstreamVera 0.1.1 2017-02-01 [1] CRAN (R 4.5.0)
fontLiberation 0.1.0 2016-10-15 [1] CRAN (R 4.5.0)
fontquiver 0.2.1 2017-02-01 [1] CRAN (R 4.5.0)
forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.5.0)
foreach 1.5.2 2022-02-02 [1] CRAN (R 4.5.0)
foreign 0.8-90 2025-03-31 [1] CRAN (R 4.5.2)
Formula 1.2-5 2023-02-24 [1] CRAN (R 4.5.0)
fs 1.6.6 2025-04-12 [1] CRAN (R 4.5.0)
furrr 0.3.1 2022-08-15 [1] CRAN (R 4.5.0)
future 1.67.0 2025-07-29 [1] CRAN (R 4.5.0)
gdata 3.0.1 2024-10-22 [1] CRAN (R 4.5.0)
gdtools 0.4.4 2025-10-06 [1] CRAN (R 4.5.0)
generics 0.1.4 2025-05-09 [1] CRAN (R 4.5.0)
ggeffects * 2.3.0 2025-06-13 [1] CRAN (R 4.5.0)
ggflags * 0.0.4 2023-10-10 [1] https://jimjam-slam.r-universe.dev (R 4.5.1)
ggfx * 1.0.2 2025-07-24 [1] CRAN (R 4.5.0)
ggh4x * 0.3.1 2025-05-30 [1] CRAN (R 4.5.0)
ggplot2 * 4.0.0 2025-09-11 [1] CRAN (R 4.5.0)
ggplotify * 0.1.2 2023-08-09 [1] CRAN (R 4.5.0)
ggridges * 0.5.7 2025-08-27 [1] CRAN (R 4.5.0)
ggtext * 0.1.2 2022-09-16 [1] CRAN (R 4.5.0)
glmnet 4.1-10 2025-07-17 [1] CRAN (R 4.5.0)
globals 0.18.0 2025-05-08 [1] CRAN (R 4.5.0)
glue 1.8.0 2024-09-30 [1] CRAN (R 4.5.0)
gridExtra * 2.3 2017-09-09 [1] CRAN (R 4.5.0)
gridGraphics 0.5-1 2020-12-13 [1] CRAN (R 4.5.0)
gridtext 0.1.5 2022-09-16 [1] CRAN (R 4.5.0)
grImport2 0.3-3 2024-07-30 [1] CRAN (R 4.5.0)
gt 1.0.0 2025-04-05 [1] CRAN (R 4.5.0)
gtable * 0.3.6 2024-10-25 [1] CRAN (R 4.5.0)
gtools 3.9.5 2023-11-20 [1] CRAN (R 4.5.0)
haven 2.5.5 2025-05-30 [1] CRAN (R 4.5.0)
Hmisc * 5.2-4 2025-10-05 [1] CRAN (R 4.5.0)
hms 1.1.3 2023-03-21 [1] CRAN (R 4.5.0)
htmlTable 2.4.3 2024-07-21 [1] CRAN (R 4.5.0)
htmltools * 0.5.8.1 2024-04-04 [1] CRAN (R 4.5.0)
htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.5.0)
httpuv 1.6.16 2025-04-16 [1] CRAN (R 4.5.0)
httr 1.4.7 2023-08-15 [1] CRAN (R 4.5.0)
insight 1.4.2 2025-09-02 [1] CRAN (R 4.5.0)
interactions * 1.2.0 2024-07-29 [1] CRAN (R 4.5.0)
iterators 1.0.14 2022-02-05 [1] CRAN (R 4.5.0)
janitor * 2.2.1 2024-12-22 [1] CRAN (R 4.5.0)
jomo 2.7-6 2023-04-15 [1] CRAN (R 4.5.0)
jpeg 0.1-11 2025-03-21 [1] CRAN (R 4.5.0)
jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.5.0)
jsonlite 2.0.0 2025-03-27 [1] CRAN (R 4.5.0)
jtools 2.3.0 2024-08-25 [1] CRAN (R 4.5.0)
kableExtra * 1.4.0 2024-01-24 [1] CRAN (R 4.5.0)
KernSmooth 2.23-26 2025-01-01 [1] CRAN (R 4.5.2)
knitr 1.50 2025-03-16 [1] CRAN (R 4.5.0)
labeling 0.4.3 2023-08-29 [1] CRAN (R 4.5.0)
labelled * 2.16.0 2025-10-22 [1] CRAN (R 4.5.0)
later 1.4.2 2025-04-08 [1] CRAN (R 4.5.0)
lattice 0.22-7 2025-04-02 [1] CRAN (R 4.5.2)
lavaan * 0.6-19 2024-09-26 [1] CRAN (R 4.5.0)
lazyeval 0.2.2 2019-03-15 [1] CRAN (R 4.5.0)
leaflet * 2.2.2 2024-03-26 [1] CRAN (R 4.5.0)
leaflet.extras * 2.0.1 2024-08-19 [1] CRAN (R 4.5.0)
leaflet.extras2 * 1.3.2 2025-08-27 [1] CRAN (R 4.5.0)
leaflet.providers 2.0.0 2023-10-17 [1] CRAN (R 4.5.0)
lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.5.0)
listenv 0.9.1 2024-01-29 [1] CRAN (R 4.5.0)
litedown 0.7 2025-04-08 [1] CRAN (R 4.5.0)
lme4 * 1.1-37 2025-03-26 [1] CRAN (R 4.5.0)
lsr * 0.5.2 2021-12-01 [1] CRAN (R 4.5.0)
lubridate * 1.9.4 2024-12-08 [1] CRAN (R 4.5.0)
magick 2.8.7 2025-06-06 [1] CRAN (R 4.5.0)
magrittr 2.0.4 2025-09-12 [1] CRAN (R 4.5.0)
markdown 2.0 2025-03-23 [1] CRAN (R 4.5.0)
MASS 7.3-65 2025-02-28 [1] CRAN (R 4.5.2)
mathjaxr 2.0-0 2025-12-01 [1] CRAN (R 4.5.2)
Matrix * 1.7-4 2025-08-28 [1] CRAN (R 4.5.2)
metadat * 1.4-0 2025-02-04 [1] CRAN (R 4.5.0)
metafor * 4.8-0 2025-01-28 [1] CRAN (R 4.5.0)
MetBrewer * 0.2.0 2022-03-21 [1] CRAN (R 4.5.0)
mgcv * 1.9-3 2025-04-04 [1] CRAN (R 4.5.2)
mice 3.18.0 2025-05-27 [1] CRAN (R 4.5.0)
mime 0.13 2025-03-17 [1] CRAN (R 4.5.0)
minqa 1.2.8 2024-08-17 [1] CRAN (R 4.5.0)
mitml 0.4-5 2023-03-08 [1] CRAN (R 4.5.0)
mitools 2.4 2019-04-26 [1] CRAN (R 4.5.0)
mnormt 2.1.1 2022-09-26 [1] CRAN (R 4.5.0)
multcomp 1.4-28 2025-01-29 [1] CRAN (R 4.5.0)
mvtnorm 1.3-3 2025-01-10 [1] CRAN (R 4.5.0)
nlme * 3.1-168 2025-03-31 [1] CRAN (R 4.5.2)
nloptr 2.2.1 2025-03-17 [1] CRAN (R 4.5.0)
nnet 7.3-20 2025-01-01 [1] CRAN (R 4.5.2)
numDeriv * 2016.8-1.1 2019-06-06 [1] CRAN (R 4.5.0)
officer * 0.7.0 2025-09-03 [1] CRAN (R 4.5.0)
openssl 2.3.3 2025-05-26 [1] CRAN (R 4.5.0)
pacman * 0.5.1 2019-03-11 [1] CRAN (R 4.5.0)
pagedown 0.23 2025-08-20 [1] CRAN (R 4.5.0)
pan 1.9 2023-12-07 [1] CRAN (R 4.5.0)
pander 0.6.6 2025-03-01 [1] CRAN (R 4.5.0)
parallelly 1.45.1 2025-07-24 [1] CRAN (R 4.5.0)
pbivnorm 0.6.0 2015-01-23 [1] CRAN (R 4.5.0)
performance * 0.15.2 2025-10-06 [1] CRAN (R 4.5.0)
pillar 1.11.0 2025-07-04 [1] CRAN (R 4.5.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.5.0)
plotly 4.11.0 2025-06-19 [1] CRAN (R 4.5.0)
png 0.1-8 2022-11-29 [1] CRAN (R 4.5.0)
processx 3.8.6 2025-02-21 [1] CRAN (R 4.5.0)
promises 1.3.3 2025-05-29 [1] CRAN (R 4.5.0)
proxy 0.4-27 2022-06-09 [1] CRAN (R 4.5.0)
ps 1.9.1 2025-04-12 [1] CRAN (R 4.5.0)
psych * 2.5.6 2025-06-23 [1] CRAN (R 4.5.0)
purrr * 1.1.0 2025-07-10 [1] CRAN (R 4.5.0)
quadprog 1.5-8 2019-11-20 [1] CRAN (R 4.5.0)
qualtRics * 3.2.1 2024-08-16 [1] CRAN (R 4.5.0)
R6 2.6.1 2025-02-15 [1] CRAN (R 4.5.0)
ragg 1.4.0 2025-04-10 [1] CRAN (R 4.5.0)
rappdirs 0.3.3 2021-01-31 [1] CRAN (R 4.5.0)
rbibutils 2.3 2024-10-04 [1] CRAN (R 4.5.0)
RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.5.0)
Rcpp 1.1.0 2025-07-02 [1] CRAN (R 4.5.0)
Rdpack 2.6.4 2025-04-09 [1] CRAN (R 4.5.0)
reactable * 0.4.4 2023-03-12 [1] CRAN (R 4.5.0)
reactR 0.6.1 2024-09-14 [1] CRAN (R 4.5.0)
readr * 2.1.5 2024-01-10 [1] CRAN (R 4.5.0)
readxl * 1.4.5 2025-03-07 [1] CRAN (R 4.5.0)
reformulas 0.4.1 2025-04-30 [1] CRAN (R 4.5.0)
report * 0.6.1 2025-02-07 [1] CRAN (R 4.5.0)
rlang * 1.1.6 2025-04-11 [1] CRAN (R 4.5.0)
rmarkdown 2.29 2024-11-04 [1] CRAN (R 4.5.0)
rmcorr * 0.7.0 2024-07-26 [1] CRAN (R 4.5.0)
rnaturalearth * 1.1.0 2025-07-28 [1] CRAN (R 4.5.0)
rnaturalearthdata * 1.0.0 2024-02-09 [1] CRAN (R 4.5.0)
rpart 4.1.24 2025-01-07 [1] CRAN (R 4.5.2)
rstudioapi 0.17.1 2024-10-22 [1] CRAN (R 4.5.0)
S7 0.2.0 2024-11-07 [1] CRAN (R 4.5.0)
sandwich 3.1-1 2024-09-15 [1] CRAN (R 4.5.0)
sass 0.4.10 2025-04-11 [1] CRAN (R 4.5.0)
scales * 1.4.0 2025-04-24 [1] CRAN (R 4.5.0)
see * 0.11.0 2025-03-11 [1] CRAN (R 4.5.0)
semTools * 0.5-7 2025-03-13 [1] CRAN (R 4.5.0)
servr 0.32 2024-10-04 [1] CRAN (R 4.5.0)
sessioninfo * 1.2.3 2025-02-05 [1] CRAN (R 4.5.0)
sf * 1.0-21 2025-05-15 [1] CRAN (R 4.5.0)
shape 1.4.6.1 2024-02-23 [1] CRAN (R 4.5.0)
showtext * 0.9-7 2024-03-02 [1] CRAN (R 4.5.0)
showtextdb * 3.0 2020-06-04 [1] CRAN (R 4.5.0)
sjlabelled 1.2.0 2022-04-10 [1] CRAN (R 4.5.0)
sjPlot * 2.9.0 2025-07-10 [1] CRAN (R 4.5.0)
snakecase 0.11.1 2023-08-27 [1] CRAN (R 4.5.0)
srvyr 1.3.0 2024-08-19 [1] CRAN (R 4.5.0)
stringi 1.8.7 2025-03-27 [1] CRAN (R 4.5.0)
stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.5.0)
survey * 4.4-8 2025-08-28 [1] CRAN (R 4.5.0)
survival * 3.8-3 2024-12-17 [1] CRAN (R 4.5.2)
svglite 2.2.1 2025-05-12 [1] CRAN (R 4.5.0)
sysfonts * 0.8.9 2024-03-02 [1] CRAN (R 4.5.0)
systemfonts 1.3.1 2025-10-01 [1] CRAN (R 4.5.0)
textshaping 1.0.1 2025-05-01 [1] CRAN (R 4.5.0)
TH.data 1.1-3 2025-01-17 [1] CRAN (R 4.5.0)
tibble * 3.3.0 2025-06-08 [1] CRAN (R 4.5.0)
tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.5.0)
tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.5.0)
timechange 0.3.0 2024-01-18 [1] CRAN (R 4.5.0)
tzdb 0.5.0 2025-03-15 [1] CRAN (R 4.5.0)
units 0.8-7 2025-03-11 [1] CRAN (R 4.5.0)
utf8 1.2.6 2025-06-08 [1] CRAN (R 4.5.0)
uuid 1.2-1 2024-07-29 [1] CRAN (R 4.5.0)
vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.5.0)
viridisLite 0.4.2 2023-05-02 [1] CRAN (R 4.5.0)
visdat * 0.6.0 2023-02-02 [1] CRAN (R 4.5.0)
vroom 1.6.5 2023-12-05 [1] CRAN (R 4.5.0)
webshot2 0.1.2 2025-04-23 [1] CRAN (R 4.5.0)
websocket 1.4.4 2025-04-10 [1] CRAN (R 4.5.0)
weights * 1.1.2 2025-06-18 [1] CRAN (R 4.5.0)
withr 3.0.2 2024-10-28 [1] CRAN (R 4.5.0)
xfun 0.52 2025-04-02 [1] CRAN (R 4.5.0)
XML 3.99-0.19 2025-08-22 [1] CRAN (R 4.5.0)
xml2 1.3.8 2025-03-14 [1] CRAN (R 4.5.0)
xtable 1.8-4 2019-04-21 [1] CRAN (R 4.5.0)
yaml 2.3.10 2024-07-26 [1] CRAN (R 4.5.0)
yulab.utils 0.2.1 2025-08-19 [1] CRAN (R 4.5.0)
zip 2.3.3 2025-05-13 [1] CRAN (R 4.5.0)
zoo 1.8-14 2025-04-10 [1] CRAN (R 4.5.0)
[1] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library
* ── Packages attached to the search path.
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────